rough draft of paper done.

2023-06-07 20:44:48 -07:00
parent 245f60a7a8
commit 7edb8543a7
38 changed files with 1130 additions and 388 deletions
--- a/src/data/init.py
+++ b/src/data/init.py
@@ -2,9 +2,24 @@ import data.main
 import data.scrape
 import data.factcheck
 import data.links
+import data.bias
+import data.emotion
+import data.broken_links
+import data.selection
+import data.sentence
+import data.sentiment
+import data.word
+
 __all__ = [
    'main'
    ,'scrape'
    ,'factcheck'
    ,'links'
+    ,'bias'
+    ,'emotion'
+    ,'broken_links'
+    ,'selection'
+    ,'sentence'
+    ,'sentiment'
+    ,'word'
 ]
--- a/src/data/bias.py
+++ b/src/data/bias.py
@@ -0,0 +1,146 @@
+import click
+from data.main import connect, paths
+import pandas as pd
+from lxml import etree
+from pathlib import Path
+import os
+import csv
+
+
+@click.command(name="bias:normalize")
+def normalize() -> None:
+    """Link publishers to bias ratings and add an ordinal to each rating.
+
+    First rebuilds ``publisher_bias``: rows of ``top.publishers`` and
+    ``bias_ratings`` are paired when the case-insensitive Jaro-Winkler
+    similarity of their names is above 0.95, and only the best-scoring pair
+    per publisher is kept. Then an integer ``ordinal`` column is added to
+    ``bias_ratings``, running from -2 (left) to 2 (right).
+    """
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE publisher_bias AS
+            WITH cte AS (
+                SELECT
+                    p.id as publisher_id
+                    ,b.id as bias_id
+                    ,b.bias as label
+                    ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
+                FROM bias_ratings b
+                JOIN top.publishers p
+                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
+            ),ranked AS (
+                SELECT
+                    publisher_id
+                    ,bias_id
+                    ,label
+                    ,similarity
+                    ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
+                FROM cte
+            )
+            SELECT
+                publisher_id
+                ,label
+                ,bias_id
+            FROM ranked
+            WHERE ranked.rn = 1
+        """)
+
+    # The UPDATE below reads this frame by its Python name, so the variable
+    # must stay called `mapping`.
+    mapping = pd.DataFrame([
+        {'label': 'left', 'ordinal': -2},
+        {'label': 'left-center', 'ordinal': -1},
+        {'label': 'center', 'ordinal': 0},
+        {'label': 'right-center', 'ordinal': 1},
+        {'label': 'right', 'ordinal': 2},
+    ])
+
+    with connect() as db:
+        # IF NOT EXISTS keeps the command re-runnable: a plain ADD COLUMN
+        # raises once the column has been created by an earlier run.
+        # assumes connect() returns a DuckDB connection - TODO confirm
+        db.query("alter table bias_ratings add column if not exists ordinal int")
+        db.query("""
+            update bias_ratings b
+            set ordinal = o.ordinal
+            FROM mapping o
+            WHERE o.label = b.bias
+        """)
+
+
+@click.command(name='bias:parse')
+def parse() -> None:
+    """Parse the saved HTML page of allsides bias ratings into a normalized csv file.
+
+    Reads ``allsides.html`` from the data directory and writes
+    ``bias_ratings.csv`` (pipe separated) with the columns ``publisher``,
+    ``bias``, ``agree`` and ``disagree``.
+    """
+    bias_html = paths('data') / 'allsides.html'
+
+    tree = etree.parse(str(bias_html), etree.HTMLParser())
+    rows = tree.getroot().xpath('//table[contains(@class,"views-table")]/tbody/tr')
+
+    ratings = []
+    for row in rows:
+        publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
+
+        # the bias label is the last path segment of the rating link
+        href = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
+        bias = href.split('/')[-1]
+
+        # "disagree" contains "agree" as a substring, so the agree lookup has
+        # to exclude it explicitly instead of relying on element order.
+        agree = row.xpath(
+            './/span[contains(@class, "agree") and not(contains(@class, "disagree"))]'
+        )[0].text
+        disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text
+
+        ratings.append({
+            'publisher': publisher,
+            'bias': bias,
+            'agree': int(agree),
+            'disagree': int(disagree),
+        })
+
+    # explicit columns keep the csv header stable even when no rows matched
+    df = pd.DataFrame(ratings, columns=['publisher', 'bias', 'agree', 'disagree'])
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+
+@click.command(name="bias:load")
+def load() -> None:
+    """Create the ``bias_ratings`` table from ``bias_ratings.csv`` in the data directory.
+
+    Every row gets an ``id`` from ``row_number()`` in publisher order,
+    followed by all columns of the csv file.
+    """
+    csv_path = str(paths('data') / "bias_ratings.csv")
+    statement = f"""
+            CREATE TABLE bias_ratings as 
+            select 
+                row_number() over(order by b.publisher) as id
+                ,b.*
+            from read_csv_auto('{csv_path}') b
+        """
+
+    with connect() as db:
+        db.sql(statement)
+
+@click.command('bias:export')
+def export():
+    """Export the bias ratings and the publisher-to-bias mapping as csv files.
+
+    Writes two pipe-separated files into the data directory:
+    ``TMP_publisher_bias.csv`` (all rows of ``bias_ratings`` ordered by
+    ``agree`` descending) and ``TMP_publisher_bias_to_load.csv`` (every row
+    of ``top.publishers`` left-joined to ``publisher_bias``).
+    """
+    with connect() as db:
+        # Materialize both results while the connection is still open; the
+        # query objects cannot be relied on once the `with` block has closed
+        # the connection.
+        all_bias = db.query("""
+            SELECT
+                id as bias_id
+                ,publisher as name
+                ,bias as label
+            FROM bias_ratings
+            ORDER by agree desc
+        """).df()
+        mapped_bias = db.query("""
+            SELECT
+                p.id as publisher_id
+                ,p.name as name
+                ,p.tld as tld
+                ,b.label as bias
+                ,b.bias_id as bias_id
+            FROM top.publishers p
+            LEFT JOIN publisher_bias b
+            ON b.publisher_id = p.id
+        """).df()
+
+    all_bias.to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
+    mapped_bias.to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
+
+@click.command('bias:import-mapped')
+def import_mapped():
+    """Rebuild ``top.publisher_bias`` from ``TMP_publisher_bias_to_load.csv``.
+
+    Only rows with a non-null ``bias_id`` are kept; ``bias_id`` is cast to int.
+    """
+    source_csv = paths('data') / 'TMP_publisher_bias_to_load.csv'
+    target = "top.publisher_bias"
+
+    # The statement below reads this frame by its Python name, so it has to
+    # stay bound to `df`.
+    df = pd.read_csv(source_csv, sep="|")
+
+    statement = f"""
+            CREATE OR REPLACE TABLE {target} AS
+            SELECT
+                publisher_id AS publisher_id
+                ,cast(bias_id AS int) as bias_id
+            FROM df
+            WHERE bias_id IS NOT NULL
+        """
+    with connect() as db:
+        db.query(statement)
+
+    print(f"created table: {target}")
+
+
--- a/src/data/broken_links.py
+++ b/src/data/broken_links.py
@@ -0,0 +1,40 @@
+import requests
+import seaborn as sns
+import matplotlib.pyplot as plt
+import click
+
+from data.main import connect
+
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects.
+
+    Requests the five oldest stories, records per story the final url,
+    whether a redirect or timeout happened, the status code and the body
+    length, then shows a histogram of the status codes that were received.
+    """
+    with connect() as db:
+        urls = db.query("""
+            select
+                id
+                ,url
+            from stories
+            order by published_at asc
+            limit 5
+        """).fetchall()
+
+    responses = []
+    for story_id, url in urls:
+        out = {
+            'story_id': story_id,
+            'final_url': url,
+            'redirect': 0,
+            'timeout': 0,
+            'status_code': 200,
+            'content_length': 0,
+        }
+        try:
+            # NOTE: certificate verification is disabled on purpose here so
+            # that sites with broken TLS setups are still reachable.
+            response = requests.get(url, verify=False, timeout=10)
+        except requests.exceptions.Timeout:
+            # Timeout covers both connect and read timeouts.
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        else:
+            # `history` holds one entry per redirect hop, so a single
+            # redirect already has length 1.
+            if response.history:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        responses.append(out)
+
+    # Timed-out requests never received a status, so leave them out of the plot.
+    status_codes = [r['status_code'] for r in responses if not r['timeout']]
+    sns.histplot(x=status_codes)
+    plt.show()
--- a/src/data/emotion.py
+++ b/src/data/emotion.py
@@ -0,0 +1,484 @@
+import click
+from tqdm import tqdm
+import torch
+import pandas as pd
+import numpy as np
+
+from transformers import BertTokenizer
+from train.model import BertForMultiLabelClassification
+from data.main import connect, data_dir
+import seaborn as sns
+import matplotlib.pyplot as plt
+from matplotlib.dates import DateFormatter
+import matplotlib.dates as mdates
+
+def data():
+    """Return the stories that have no row in ``story_emotions`` yet.
+
+    Returns:
+        DataFrame with the columns ``id`` and ``title``, ordered by id
+        descending.
+    """
+    # `with` closes the connection even when the query raises.
+    with connect() as db:
+        return db.sql("""
+            SELECT
+                id,
+                title
+            FROM stories
+            WHERE id NOT IN (
+                SELECT
+                    DISTINCT story_id
+                FROM story_emotions
+            )
+            ORDER BY id DESC
+        """).df()
+
+@click.command("emotion:create-table")
+def create_table():
+    """create the table to hold the title id and labels."""
+    table = "story_emotions"
+    with connect() as db:
+        # Must be an f-string: without the prefix the literal text "{table}"
+        # is sent to the database instead of the table name.
+        db.execute(f"""
+            CREATE OR REPLACE TABLE {table}
+            (
+                story_id BIGINT,
+                label TEXT,
+                score REAL
+            )
+        """)
+    print(f"\"{table}\" created")
+
+@click.command("emotion:extract")
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+def extract(chunks):
+    """extract emotion class labels from titles and put them in the db
+
+    Titles not yet present in ``story_emotions`` are classified in batches
+    and every (story, label) pair that passes the threshold in ``run`` is
+    inserted after its batch.
+    """
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
+    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
+    model.to(device)
+
+    rows = data().to_numpy()
+    # Asking array_split for more parts than rows produces empty parts,
+    # which the tokenizer cannot handle.
+    n_chunks = max(1, min(chunks, len(rows)))
+    for part in tqdm(np.array_split(rows, n_chunks)):
+        if len(part) == 0:
+            continue
+        ids = [x[0] for x in part]
+        docs = [x[1] for x in part]
+        tokens = tokenizer(docs, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        results = run(model, tokens, ids)
+        if not results:
+            # a frame without columns cannot be inserted
+            continue
+        # The INSERT reads this frame by its Python name `df`.
+        df = pd.DataFrame(results)
+        with connect() as db:
+            db.execute('INSERT INTO story_emotions SELECT * FROM df')
+
+def run(model, tokens, ids):
+    """Score one tokenized batch and keep every label above the threshold.
+
+    Args:
+        model: callable taking the tokens as keyword arguments; the first
+            element of its result is used as the per-label scores and
+            ``model.config.id2label`` maps a column index to a label.
+        tokens: mapping passed to the model as ``**tokens``.
+        ids: story ids, one per row of the batch.
+
+    Returns:
+        list of dicts with the keys ``story_id``, ``label`` and ``score``.
+    """
+    cutoff = 0.1
+    with torch.no_grad():
+        raw = model(**tokens)[0].to('cpu').detach().numpy()
+    probabilities = 1 / (1 + np.exp(-raw))  # element-wise sigmoid
+    return [
+        {"story_id": ids[row], "label" : model.config.id2label[col], "score": value}
+        for row, per_label in enumerate(probabilities)
+        for col, value in enumerate(per_label)
+        if value > cutoff
+    ]
+
+@click.command("emotion:normalize")
+def normalize():
+    """normalize the emotion tables.
+
+    Builds ``emotions`` (one row per label attached to more than 1000
+    stories), adds ``emotion_id`` to ``story_emotions`` pointing at it, and
+    then drops the text ``label`` column from ``story_emotions``. Rows whose
+    label did not make it into ``emotions`` keep a NULL ``emotion_id``.
+    """
+    # `with` closes the connection even when one of the statements raises.
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE emotions AS
+            SELECT
+                row_number() over() as id
+                ,e.label
+                ,COUNT(1) AS stories
+            FROM story_emotions e
+            JOIN stories s
+            ON s.id = e.story_id
+            -- WHERE YEAR(s.published_at) < 2022
+            GROUP BY e.label
+            HAVING stories > 1000
+            ORDER BY stories DESC
+        """)
+        db.sql("""
+            ALTER TABLE story_emotions
+            ADD COLUMN emotion_id int64
+        """)
+        db.sql("""
+            UPDATE story_emotions
+            SET emotion_id = emotions.id
+            FROM emotions
+            WHERE emotions.label = story_emotions.label
+        """)
+        db.sql("""
+            ALTER TABLE story_emotions
+            DROP COLUMN label
+        """)
+        # The former trailing SELECT was removed: it read
+        # story_emotions.label, which the statement above has just dropped,
+        # and its result was never used.
+
+# NOTE(review): this command is registered under the same name
+# ("emotion:analyze") as `analyze` further down in this file - confirm which
+# of the two is meant to own that name.
+@click.command("emotion:analyze")
+def coef_over_time():
+    """Fit a linear trend per emotion over time and print the slopes.
+
+    For every label in ``emotions`` except ``neutral`` the share of stories
+    carrying that label is computed per 2-year bucket (stories before 2022)
+    and a linear regression of share on time (in years) is fitted. The
+    resulting table of coefficients is printed, sorted by coefficient.
+    """
+    from sklearn import linear_model
+    from sklearn.model_selection import train_test_split
+
+    def fetch_shares(db, buckets='1 month'):
+        """Return date (years since epoch), label and story share per bucket."""
+        return db.sql(f"""
+            with cte as (
+                SELECT
+                    time_bucket(interval '{buckets}', s.published_at) as date
+                    ,e.label
+                    ,COUNT(1) AS stories
+                FROM stories s
+                JOIN story_emotions se
+                ON s.id = se.story_id
+                JOIN emotions e
+                ON e.id = se.emotion_id
+                WHERE YEAR(s.published_at) < 2022
+                GROUP BY
+                    time_bucket(interval '{buckets}', s.published_at)
+                    ,e.label
+            )
+            ,total as (
+                SELECT
+                    time_bucket(interval '{buckets}', s.published_at) as date
+                    ,COUNT(1) AS stories
+                FROM stories s
+                WHERE YEAR(s.published_at) < 2022
+                GROUP BY
+                    time_bucket(interval '{buckets}', s.published_at)
+            )
+            select
+                epoch(cte.date) / 60 / 60 / 24 / 365 as date
+                ,cte.label
+                ,cast(cte.stories as float) / t.stories as stories
+            from cte
+            join total t
+            on t.date = cte.date
+        """).df()
+
+    def get_coef(shares, label):
+        """Return the regression coefficients for one label, or None if too few points."""
+        df = shares[shares['label'] == label]
+        if len(df) < 2:
+            # train_test_split needs at least two samples
+            return None
+        x = df['date'].to_numpy().reshape(-1, 1)
+        y = df['stories']
+        # x and y have to be split in ONE call: two separate calls shuffle
+        # independently and pair each date with some other date's share.
+        x_train, _x_test, y_train, _y_test = train_test_split(x, y)
+        reg = linear_model.LinearRegression()
+        reg.fit(x_train, y_train)
+        return reg.coef_
+
+    with connect() as db:
+        emotions = db.sql("""
+            select label from emotions
+        """).df()
+        shares = fetch_shares(db, '2 year')
+
+    collection = []
+    for emotion in emotions['label']:
+        if emotion == 'neutral':
+            continue
+        coefs = get_coef(shares, emotion)
+        if coefs is None:
+            continue
+        coef = coefs[0]
+        collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
+
+    # explicit columns so sorting also works when nothing was collected
+    summary = pd.DataFrame(collection, columns=['emotion', 'coef', 'increasing'])
+    print(summary.sort_values('coef'))
+
+# NOTE(review): the command name "emotion:analyze" is also used by
+# `coef_over_time` earlier in this file.
+@click.command("emotion:analyze")
+def analyze():
+    """plot and group emotional labels"""
+    DB = connect()
+
+    emotions = DB.sql("""
+        select label from emotions
+    """).df()
+
+    from sklearn import linear_model
+    from sklearn.model_selection import train_test_split
+    # NOTE(review): get_coef is defined but never called in this function,
+    # and its `emotion` argument is not used - the query is hard-coded to
+    # the label 'sadness'.
+    def get_coef(emotion):
+        df = DB.sql("""
+            with cte as (
+                SELECT
+                    time_bucket(interval '1 month', s.published_at) as date
+                    ,e.label
+                    ,COUNT(1) AS stories
+                FROM stories s
+                JOIN story_emotions se
+                ON s.id = se.story_id
+                JOIN emotions e
+                ON e.id = se.emotion_id
+                WHERE YEAR(s.published_at) < 2022
+                --AND e.label in ('neutral', 'annoyance')
+                AND e.label in ('sadness')
+                GROUP BY 
+                    time_bucket(interval '1 month', s.published_at)
+                    ,e.label
+            )
+            ,total as (
+                SELECT
+                    time_bucket(interval '1 month', s.published_at) as date
+                    ,COUNT(1) AS stories
+                FROM stories s
+                WHERE YEAR(s.published_at) < 2022
+                GROUP BY 
+                    time_bucket(interval '1 month', s.published_at)
+            )
+            select
+                epoch(cte.date) as date
+                ,cte.label
+                --,total.stories as total
+                ,cast(cte.stories as float) / e.stories as stories
+            from cte
+            join emotions e
+            --on total.date = cte.date
+            on e.label = cte.label
+        """).df()
+
+        reg = linear_model.LinearRegression()
+        x = df['date'].to_numpy().reshape(-1, 1)
+        y = df['stories']
+
+        # NOTE(review): x and y are split by two separate calls, so the
+        # training rows of x and y are shuffled independently of each other.
+        x_train, x_test = train_test_split(x)
+        y_train, y_test = train_test_split(y)
+        reg.fit(x_train, y_train)
+        #y_pred = reg.predict(x_test)
+        return reg.coef_
+
+
+    # NOTE(review): `yearly` is not defined in this function or anywhere
+    # else in this file, so this line raises NameError as written.
+    df = DB.sql(f"""{yearly}""").df()
+    df['date'] = pd.to_datetime(df['date'])
+    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
+    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
+    plt.locator_params(axis='y', nbins=6)
+    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
+    plt.show()
+
+    # NOTE(review): the `grouped` CTE below has an empty body, and the
+    # result of this query is not assigned to anything.
+    DB.sql("""
+        WITH grouped as (
+        ), total AS (
+            SELECT
+                e.label
+                ,count(1) as total
+            FROM grouped s
+            JOIN story_emotions e
+            ON e.label = s.label
+            GROUP BY
+                e.label
+        )
+        SELECT
+            g.year
+            ,g.label
+            ,100 * (g.stories / CAST(t.total AS float)) AS frac
+        FROM grouped g
+        JOIN total t
+        ON t.label = g.label
+        ORDER BY g.label, g.year
+    """)
+    DB.close()
+
+    # NOTE(review): `df` here is still the frame loaded from `yearly` above
+    # (used with 'date'/'stories'/'label'); 'year' and 'frac' are the column
+    # names of the discarded query - presumably that result was meant.
+    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
+    plt.show()
+
+def debug():
+    """Classify all story titles with a text-classification pipeline and rebuild the emotion tables.
+
+    Loads every story id and title, runs the
+    ``j-hartmann/emotion-english-distilroberta-base`` pipeline over the
+    titles in 5000 chunks, replaces ``story_emotions`` and ``emotions`` with
+    the result and writes the labelled frame to ``emotions.csv``.
+    Not registered as a click command.
+    """
+    from transformers import pipeline
+
+    # load data
+    DB = connect()
+    table = DB.sql("""
+        SELECT
+            id,
+            title
+        FROM stories
+        ORDER BY id DESC
+    """).df()
+    DB.close()
+
+    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
+
+    chunks = 5000
+    chunked = np.array_split(table, chunks)
+    labels = []
+    ids = []
+    for chunk in tqdm(chunked):
+        sentences = chunk['title'].tolist()
+        label_ids = chunk['id'].tolist()
+        with torch.no_grad():
+            emotions = classifier(sentences)
+        labels.append(emotions)
+        ids.append(label_ids)
+    # flatten the per-chunk results and put the story ids next to them
+    # (presumably each classifier result is a dict with 'label' and 'score',
+    # which the SQL below selects - verify against the pipeline output)
+    out = pd.DataFrame(np.concatenate(labels).tolist())
+    out_ids = pd.DataFrame(np.concatenate(ids).tolist(), columns=['story_id'])
+    out = pd.concat([out_ids, out], axis=1)
+
+    DB = connect()
+    # `out` is read by the query through its Python variable name
+    DB.sql("""
+        CREATE OR REPLACE TABLE story_emotions AS
+        SELECT
+            story_id
+            ,label
+            ,score
+        FROM out
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE emotions AS
+        SELECT
+            row_number() over() as id
+            ,label
+            ,count(1) as stories
+        FROM story_emotions
+        GROUP BY
+            label
+    """)
+    DB.sql("""
+        ALTER TABLE story_emotions add emotion_id bigint
+    """)
+    DB.sql("""
+        UPDATE story_emotions
+        SET emotion_id = emotions.id
+        FROM emotions
+        WHERE story_emotions.label = emotions.label
+    """)
+    # the text label is replaced by the emotion_id reference set above
+    DB.sql("""
+        ALTER TABLE story_emotions drop column label
+    """)
+    # NOTE(review): the results of the two SELECTs below are not used
+    DB.sql("""
+        select
+            *
+        from emotions
+    """)
+    DB.sql("""
+        select
+        * from story_emotions
+        limit 4
+    """)
+    DB.close()
+
+    out.to_csv(data_dir() / 'emotions.csv', sep="|")
+
+def another():
+    """Scratch analysis of emotion counts per year and per publisher bias.
+
+    NOTE(review): not runnable top to bottom as written - see the inline
+    notes (connection closed before later queries, undefined name).
+    Not registered as a click command.
+    """
+    DB = connect()
+
+    # NOTE(review): the results of the next two queries are not used
+    DB.sql("""
+        select
+            *
+        from emotions
+    """)
+
+    DB.sql("""
+        select
+            *
+        from story_emotions
+    """)
+
+    # story counts per publication year and emotion label
+    emotions = DB.sql("""
+        SELECT
+            YEAR(s.published_at) AS year
+            ,e.label AS emotion
+            ,count(1) AS stories
+        FROM stories s
+        JOIN story_emotions se
+        ON s.id = se.story_id
+        JOIN emotions e
+        ON e.id = se.emotion_id
+        GROUP by
+            YEAR(s.published_at)
+            ,e.label
+    """).df()
+    emotions
+
+    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
+    plt.show()
+
+    # one column per emotion, one row per year
+    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
+    pivot.reset_index(inplace=True)
+    from sklearn.linear_model import LinearRegression
+    reg = LinearRegression()
+
+    # fit stories-per-year against year for each emotion column and print the slope
+    for emotion in pivot.keys()[1:].tolist():
+        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
+        print(f"{emotion}: {reg.coef_[0]}")
+
+    fig, ax = plt.subplots()
+    #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
+    #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
+    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
+    # NOTE(review): this selects a column named '' (empty string) - looks
+    # like an unfinished placeholder; confirm the intended column
+    sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
+    plt.show()
+
+    DB.close()
+
+    # NOTE(review): every DB.sql call from here on runs after DB.close() above
+    normalized = DB.sql("""
+        with cte as (
+            select
+                year(s.published_at) as year
+                ,se.label as emotion
+                ,b.label as bias
+            from stories s
+            join story_emotions se
+            on s.id = se.story_id
+            join publisher_bias b
+            on b.id = s.publisher_id
+            where b.label != 'allsides'
+            and se.label != 'neutral'
+        )
+        select
+            distinct
+            year
+            ,emotion
+            ,bias
+            ,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
+        from cte
+    """).df()
+
+    # NOTE(review): the frame returned here is not assigned to anything
+    DB.sql("""
+        select
+            b.label as bias
+            ,count(1) as stories
+        from stories s
+        join story_emotions se
+        on s.id = se.story_id
+        join publisher_bias b
+        on b.id = s.publisher_id
+        group by
+            b.label
+    """).df()
+
+    # NOTE(review): `emotional_bias` is not assigned anywhere in this file -
+    # presumably the unassigned frame above was meant; it has no
+    # 'year'/'emotion' columns, so confirm the intended source
+    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
+    another_pivot.reset_index(inplace=True)
+
+    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
+    plt.show()
+
+    # one panel per bias, one line per emotion
+    sns.relplot(
+        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
+        #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
+    )
+    plt.show()
+
+    # `another_pivot` is read by the query through its Python variable name
+    DB.sql("""
+    select
+    *
+    from another_pivot
+    """)
--- a/src/data/factcheck.py
+++ b/src/data/factcheck.py
@@ -8,7 +8,7 @@ from pathlib import Path
 import os
 import sys
 import click
-from data.main import connect, map_tld, paths
+from data.main import connect, map_tld, paths, reporting_label_to_int
 from random import randint
 from time import sleep
 from tqdm import tqdm
@@ -155,7 +155,7 @@ def create_tables():
            FROM stories s
        """).df()

-    stories['tld'] = stories.url.apply(map_tld)
+    raw_stories['tld'] = raw_stories.url.apply(map_tld)
    
    with connect() as db:
        db.sql("""
@@ -167,5 +167,25 @@ def create_tables():
            JOIN mbfc.publishers p
            ON p.tld = s.tld
        """)
+    with connect() as db:
+        data = db.sql("""
+            select
+                id,
+                reporting
+            from mbfc.publishers p
+        """).df()

+    with connect() as db:
+        db.sql("""
+            alter table mbfc.publishers add column reporting_ordinal int
+        """)

+    data['ordinal'] = data.reporting.apply(reporting_label_to_int)
+
+    with connect() as db:
+        db.sql("""
+            update mbfc.publishers
+            set reporting_ordinal = data.ordinal
+            from data
+            where data.id = publishers.id
+        """)
--- a/src/data/main.py
+++ b/src/data/main.py
@@ -22,6 +22,8 @@ def paths(name='app'):
        return Path(os.environ['DATA_MINING_DOCS_DIR'])
    if 'figure' in name:
        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
+    if 'model' in name:
+        return Path(os.environ['DATA_MINING_DATA_DIR']) / 'models'

 def connect():
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -105,3 +107,32 @@ def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    except:
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1
+
+def reporting_label_to_int(label):
+    """Map a factual-reporting label to its integer code.
+
+    Returns:
+        the code from the table below, or -1 when the label is not in the
+        table (this includes ``None`` and unhashable values).
+    """
+    # NOTE(review): 'Mixed' maps to -1, the same value that is returned for
+    # unknown labels, while the other codes run 0, 1, 3, 4, 5 - confirm
+    # that this is intended and not a typo for 2.
+    mapping = {
+        'Very Low': 0,
+        'Low': 1,
+        'Mixed': -1,
+        'Mostly Factual': 3,
+        'High': 4,
+        'Very High': 5
+    }
+    try:
+        return mapping[label]
+    except (KeyError, TypeError):
+        # KeyError: unknown label; TypeError: unhashable value such as a list
+        return -1
+
+def save_model(model, name):
+    """Pickle ``model`` to the file ``name`` inside the models directory."""
+    from pickle import dump
+    destination = paths('models') / name
+    with open(destination, 'wb') as handle:
+        dump(model, handle)
+    print(f"saved model: {destination}")
+
+def load_model(name):
+    """Unpickle and return the model stored as ``name`` in the models directory.
+
+    NOTE: unpickling can execute arbitrary code, so only load files that
+    were written by a trusted source.
+    """
+    from pickle import load
+    source = paths('models') / name
+    print(f"loading model: {source}")
+    with open(source, 'rb') as handle:
+        return load(handle)
--- a/src/data/sentence.py
+++ b/src/data/sentence.py
@@ -0,0 +1,287 @@
+import click
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+from data.main import connect, paths, save_model, load_model, ticklabels
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+# Mean pooling: average the token embeddings, counting only unmasked tokens
+def mean_pooling(model_output, attention_mask):
+    """Return the attention-mask weighted mean of the token embeddings.
+
+    Args:
+        model_output: sequence whose first element holds the token embeddings.
+        attention_mask: mask that is expanded to the shape of the token
+            embeddings; positions with 0 do not contribute to the mean.
+    """
+    token_embeddings = model_output[0]
+    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    summed = torch.sum(token_embeddings * weights, 1)
+    # clamp avoids a division by zero for rows that are masked out entirely
+    counts = torch.clamp(weights.sum(1), min=1e-9)
+    return summed / counts
+
+@click.command("sentence:embed")
+@click.option('-c', '--chunks', type=int, default=500, show_default=True)
+def embed(chunks):
+    """Embed every story title and save the embeddings and their story ids.
+
+    Titles are ascii-normalized, embedded in batches with
+    ``sentence-transformers/all-MiniLM-L6-v2`` (mean pooled, L2 normalized)
+    and written to ``embeddings.npy``; the matching story ids go to
+    ``embedding_ids.npy`` in the same order.
+    """
+    # Load model from HuggingFace Hub
+    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+    # load data
+    with connect() as db:
+        table = db.sql("""
+            select
+            id
+            ,title
+            from stories
+            order by id desc
+        """).df()
+
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # Split row positions rather than the frame itself, and never ask for
+    # more parts than rows: empty batches cannot be tokenized.
+    n_chunks = max(1, min(chunks, len(table)))
+    batches = np.array_split(np.arange(len(table)), n_chunks)
+
+    # generate embeddings from list of titles
+    embeddings = []
+    embedding_ids = []
+    for positions in tqdm(batches, 'embedding'):
+        if len(positions) == 0:
+            continue
+        chunk = table.iloc[positions]
+        sentences = chunk['title'].tolist()
+        ids = chunk['id'].tolist()
+        # Tokenize sentences
+        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = model(**encoded_input)
+        # Perform pooling
+        output = mean_pooling(model_output, encoded_input['attention_mask'])
+        # Normalize embeddings
+        output = F.normalize(output, p=2, dim=1)
+        # store plain numpy arrays so the concatenation below does not
+        # depend on implicit tensor conversion
+        embeddings.append(output.cpu().numpy())
+        embedding_ids.append(ids)
+
+    if not embeddings:
+        # np.concatenate raises on an empty list
+        print("no stories to embed")
+        return
+
+    embeddings = np.concatenate(embeddings)
+    ids = np.concatenate(embedding_ids)
+
+    # save embeddings
+    save_to = paths('data') / 'embeddings.npy'
+    np.save(save_to, embeddings)
+    print(f"embeddings saved: {save_to}")
+
+    # save ids
+    save_to = paths('data') / 'embedding_ids.npy'
+    np.save(save_to, ids)
+    print(f"ids saved: {save_to}")
+
+
+@click.command('sentence:create-avg-pca-table')
+def create_avg_pca_table():
+    """Project the mean title embedding of each publisher onto two PCA components.
+
+    Loads the saved embeddings, averages them per publisher (only stories
+    whose publisher has a row in ``top.publisher_bias`` and ``bias_ratings``
+    take part), fits a 2-component PCA on the averages and stores the result
+    in ``top.publisher_embeddings_pca``.
+    """
+    from sklearn.decomposition import PCA
+
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
+    # `index` is the row position inside `embeddings`; the query below reads
+    # this frame by its Python name `ids`.
+    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,s.publisher_id
+                ,b.ordinal
+            FROM ids
+            JOIN top.stories s
+            ON ids.story_id = s.id
+            JOIN top.publisher_bias pb
+            ON pb.publisher_id = s.publisher_id
+            JOIN bias_ratings b
+            ON b.id = pb.bias_id
+        """).df()
+
+    publisher_ids = []
+    averages = []
+    # Group by the plain column name: grouping by a one-element list makes
+    # current pandas yield 1-tuples as keys instead of scalar ids.
+    for publisher_id, group in data.groupby('publisher_id'):
+        publisher_ids.append(publisher_id)
+        averages.append(embeddings[group['index'].to_numpy()].mean(axis=0))
+
+    pred = PCA(n_components=2).fit_transform(np.stack(averages))
+
+    # Only scalar columns go into the frame handed to the database; it is
+    # read by the statement below through its Python name `results`.
+    results = pd.DataFrame({
+        'publisher_id': publisher_ids,
+        'first': pred[:, 0],
+        'second': pred[:, 1],
+    })
+
+    table_name = "top.publisher_embeddings_pca"
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                results.publisher_id as publisher_id
+                ,results.first as first
+                ,results.second as second
+            FROM results
+        """)
+
+    print(f"created {table_name}")
+
+
+@click.command('sentence:create-pca-table')
+def create_pca_table():
+    """Project every story embedding onto two PCA components.
+
+    Uses the stories that are linked to an ``mbfc.publishers`` row whose
+    ``ordinal`` is not -1 and writes story id plus both components to
+    ``story_embeddings_pca``.
+    """
+    from sklearn.decomposition import PCA
+
+    # `paths` is the helper imported from data.main; the former `path(...)`
+    # calls referred to a name that does not exist in this module.
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
+    # `index` is the row position inside `embeddings`; the query below reads
+    # this frame by its Python name `ids`.
+    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,p.bias
+                ,p.ordinal
+            FROM ids
+            JOIN stories s
+            ON ids.story_id = s.id
+            JOIN mbfc.publisher_stories ps
+            ON s.id = ps.story_id
+            JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+            WHERE p.ordinal != -1
+        """).df()
+
+    x = embeddings[data['index'].to_numpy()]
+    pred = PCA(n_components=2).fit_transform(x)
+    data['first'] = pred[:, 0]
+    data['second'] = pred[:, 1]
+
+    table_name = "story_embeddings_pca"
+    with connect() as db:
+        # `data` is read by the statement through its Python name
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                data.id as story_id
+                ,data.first as first
+                ,data.second as second
+            FROM data
+        """)
+    print(f"created {table_name}")
+
+@click.command('sentence:create-svm-table')
+def create_svm_table():
+    """sentence to classifier"""
+
+    from sklearn import svm
+    from sklearn.linear_model import SGDClassifier
+
+    data_dir = paths('data')
+    embeddings = np.load(data_dir / 'embeddings.npy')
+    embedding_ids = np.load(data_dir / 'embedding_ids.npy')
+    # the query reads this frame by its Python name `ids`; `index` is the
+    # row position inside `embeddings`
+    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+
+    query = """
+            SELECT
+                ids.index
+                ,s.id
+                ,p.ordinal
+                ,p.bias
+            FROM ids
+            JOIN stories s
+            ON ids.story_id = s.id
+            JOIN mbfc.publisher_stories ps
+            ON s.id = ps.story_id
+            JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+            WHERE p.ordinal != -1
+        """
+    with connect() as db:
+        data = db.query(query).df()
+
+    # features: one embedding per story row; target: the publisher ordinal
+    features = embeddings[data['index']]
+    target = data['ordinal']
+
+    classifier = SGDClassifier().fit(features, target)
+    save_model(classifier, 'sgdclassifier.pkl')
+
+
+def interence():
+    """Scratch routine: run the saved SGD classifier on two hard-coded sentences.
+
+    NOTE(review): not runnable as written - `tokenizer` and `model` are not
+    defined in this function or at module level in this file. The function
+    name looks like a typo for "inference".
+    """
+
+    # distinct (bias, ordinal) pairs; NOTE(review): `bias` is not used below
+    with connect() as db:
+         bias = db.query("""
+            SELECT
+                p.bias
+                ,p.ordinal
+            FROM mbfc.publishers p
+            WHERE p.ordinal != -1
+            GROUP BY
+                p.bias
+                ,p.ordinal
+            ORDER BY
+                p.ordinal
+        """).df()
+
+    sdg = load_model( 'sgdclassifier.pkl')
+
+
+    # NOTE(review): `tokenizer` is undefined here
+    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')
+
+    # NOTE(review): `model` is undefined here
+    with torch.no_grad():
+        output = model(**tokens)
+
+    output = mean_pooling(output, tokens['attention_mask'])
+
+    output = F.normalize(output, p=2, dim=1)
+    # NOTE(review): the prediction is not stored, printed or returned
+    sdg.predict(output)
+
+    tokens
+    dir(output)
+
+def validation():
+    """Train a LinearSVC on story embeddings and plot its test confusion matrix.
+
+    Uses the stories linked to an ``mbfc.publishers`` row whose ``ordinal``
+    is not -1, holds out 33% of them for testing (fixed random state), saves
+    the confusion matrix figure and then shows it.
+    """
+
+    from sklearn.model_selection import train_test_split
+    from sklearn.svm import LinearSVC
+    from sklearn.metrics import ConfusionMatrixDisplay
+    import matplotlib.pyplot as plt
+
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
+    # the query reads this frame by its Python name `ids`; `index` is the
+    # row position inside `embeddings`
+    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,p.ordinal
+                ,p.bias
+            FROM ids
+            JOIN stories s
+            ON ids.story_id = s.id
+            JOIN mbfc.publisher_stories ps
+            ON s.id = ps.story_id
+            JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+            WHERE p.ordinal != -1
+        """).df()
+
+    x = embeddings[data['index']]
+    y = data['ordinal']
+
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
+
+    clf = LinearSVC()
+    clf.fit(x_train, y_train)
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
+    # the title names the estimator that is actually fitted above
+    ax.set(title="confusion matrix for LinearSVC classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())
+
+    # `save_to` was never defined before; write into the figures directory.
+    # NOTE(review): the file name is a placeholder - adjust to the name the
+    # paper expects.
+    save_to = paths('figures') / 'sentence_confusion_matrix.png'
+    # Save before show(): afterwards the figure may already be released and
+    # the written file would be blank.
+    fig.savefig(save_to)
+    plt.show()
--- a/src/data/sentiment.py
+++ b/src/data/sentiment.py
@@ -20,15 +20,14 @@ def extract(chunks):


    # load data
-    DB = connect()
-    table = DB.sql("""
-        select
-        id
-        ,title
-        from stories
-        order by id desc
-    """).df()
-    DB.close()
+    with connect() as db:
+        table = db.sql("""
+            select
+            id
+            ,title
+            from stories
+            order by id desc
+        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -56,12 +55,12 @@ def extract(chunks):
    story_ids = np.concatenate(story_ids)

    # save embeddings
-    save_to = data_dir() / 'sentiment.npy'
+    save_to = paths('data') / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
-    save_to = data_dir() / 'sentiment_ids.npy'
+    save_to = paths('data') / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")

--- a/src/data/word.py
+++ b/src/data/word.py
@@ -0,0 +1,93 @@
+import click
+from transformers import AutoTokenizer, RobertaModel
+import numpy as np
+from data.main import connect, paths
+from tqdm import tqdm
+import torch
+from pathlib import Path
+
+@click.command(name="word:max-sequence")
+def max_sequence():
+    """calculate the maximum token length given the story titles"""
+    # only the 5000 titles that sort first by length(title) are tokenized
+    with connect() as db:
+        candidates = db.sql("""
+            select
+                title
+            from stories
+            order by length(title) desc
+            limit 5000
+        """).df()
+    titles = candidates['title'].to_list()
+    encoded = AutoTokenizer.from_pretrained("roberta-base")(titles)
+    print(max(map(len, encoded['input_ids'])))
+
+@click.command(name="word:embed")
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(paths('data') / 'embeddings'), show_default=True)
+@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(paths('data') / 'tokens'), show_default=True)
+@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
+def embed(chunks, embedding_dir, token_dir, device):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+    # init models
+    device = torch.device(device)
+    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+    model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)
+    # np.save does not create missing directories, so make sure both targets exist
+    embedding_dir.mkdir(parents=True, exist_ok=True)
+    token_dir.mkdir(parents=True, exist_ok=True)
+
+    # load data
+    with connect() as db:
+        table = db.sql("""
+            select
+            title
+            from stories
+            order by id desc
+        """).df()
+    # normalize text: decompose with NFKD, then drop anything that is not ascii
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # generate embeddings from list of titles; `chunks` is the number of splits, not a batch size
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    for i, chunk in enumerate(tqdm(chunks, 'embedding')):
+        if chunk.size == 0:  # more splits requested than titles: nothing to embed
+            continue
+        # create tokens, padding to max width (92 presumably comes from word:max-sequence -- confirm)
+        tokens = tokenizer(chunk.tolist(), add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        # to disk
+        hidden = outputs.last_hidden_state.detach().cpu().numpy()
+        np.save(embedding_dir / f"embedding_{i}.npy", hidden)
+        # `tokens` is a mapping (unpacked with ** above), not an array: store the token ids as a plain array
+        np.save(token_dir / f"token_{i}.npy", tokens['input_ids'].cpu().numpy())
+
+@click.command(name="word:distance")
+def distance():
+    """TODO: measure distance between sequence embeddings"""
+    from scipy.spatial.distance import cdist  # `distance` in this module is this command, not scipy's module
+    # NOTE(review): `classes` is not defined in the visible code -- the embedding matrix must be loaded first
+    distances = cdist(classes, classes, 'euclidean')
+    np.fill_diagonal(distances, np.inf)  # mask self-distances so argmin picks two distinct rows
+    closest = np.unravel_index(np.argmin(distances), distances.shape)  # (row, col) of nearest pair; not reported yet
+
+# path = paths('data') / 'embeddings'
+# chunks = [x for x in path.iterdir() if x.match('*.npy')]
+# chunks = sorted(chunks, key=lambda x: int(x.stem.split('_')[1]))
+# 
+# data = None
+# for i, f in enumerate(tqdm(chunks)):
+#     loaded = np.load(f)
+#     if data is None:
+#         data = loaded
+#     else:
+#         data = np.concatenate([data, loaded])
+#     if i > 20:
+#         break
+# 
+# data.shape
+# 
+# np.save(data, paths('data') / 'embeddings.npy')