Add experiment 4 of 5 (sentiment/emotion plots, election-distance tables, and avg-embedding PCA)

This commit is contained in:
matt
2023-05-17 21:38:21 -07:00
parent 74c2d8afa2
commit 3f7b3ad467
16 changed files with 905 additions and 59 deletions

View File

@@ -12,6 +12,8 @@ if __name__ == "__main__":
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
cli.add_command(scrape.normalize)
cli.add_command(scrape.create_elections_table)
import word
# cli.add_command(word.distance)
# cli.add_command(word.train)
@@ -30,8 +32,11 @@ if __name__ == "__main__":
cli.add_command(emotion.normalize)
cli.add_command(emotion.analyze)
cli.add_command(emotion.create_table)
import sentence
cli.add_command(sentence.embed)
cli.add_command(sentence.create_avg_pca_table)
from train import main as train_main
cli.add_command(train_main.main)
@@ -54,4 +59,14 @@ if __name__ == "__main__":
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli()

View File

@@ -335,3 +335,92 @@ def another_norm():
on sv2.id = s.id
limit 5
""")
@click.command('data:create-election-table')
def create_elections_table():
    """Load election dates from CSV and build two DuckDB tables.

    Creates `election_dates` (one row per election with a synthetic id)
    and `election_distance` (for every distinct story publish date, the
    day-distance to the *nearest* election).
    """
    # Pipe-delimited file; assumes columns `type` and `date` exist.
    df = pd.read_csv(data_dir() / 'election_dates.csv', sep="|")
    df['date'] = pd.to_datetime(df.date)
    DB = connect()
    # DuckDB resolves the local DataFrame `df` by name in the FROM clause.
    DB.query("""
        CREATE OR REPLACE TABLE election_dates AS
        SELECT
            row_number() over() as id
            ,type
            ,date
        FROM df
    """)
    # Cross-join every distinct publish date against every election, then
    # keep only the closest election per publish date (rn = 1, ordered by
    # absolute day distance). `days_away` keeps its sign (future vs. past).
    DB.query("""
        CREATE OR REPLACE TABLE election_distance AS
        WITH cte as (
            SELECT
                day(e.date - s.published_at) as days_away
                ,e.id as election_id
                ,e.date as election_date
                ,s.published_at as publish_date
            FROM (
                SELECT
                    DISTINCT
                    published_at
                FROM top.stories
            ) s
            CROSS JOIN election_dates e
        ) , windowed as (
            SELECT
                row_number() over(partition by publish_date order by abs(days_away) asc) as rn
                ,days_away
                ,publish_date
                ,election_date
                ,election_id
            FROM cte
        )
        SELECT
            days_away
            ,publish_date
            ,election_date
            ,election_id
        FROM windowed
        WHERE rn = 1
    """)
    DB.close()
@click.command('scrape:create-denorm')
def create_denorm():
    """Build a denormalized `denorm.stories` table joining stories with
    publisher, sentiment, election distance, bias, PCA link components,
    and emotion data.

    NOTE(review): all joins are inner joins, so any story missing one of
    these relations is silently dropped from the denormalized table.
    NOTE(review): `create schema denorm` raises if the schema already
    exists — this command assumes a fresh database.
    """
    DB = connect()
    DB.sql("create schema denorm")
    DB.sql("""
        CREATE OR REPLACE TABLE denorm.stories AS
        SELECT
            s.id as story_id
            ,s.title
            ,s.url
            ,s.published_at
            ,s.author
            ,p.name as publisher
            ,p.tld as tld
            ,sent.class_id as sentiment
            ,d.days_away as election_distance
            ,b.ordinal as bias
            ,pca.first as link_1
            ,pca.second as link_2
            ,e.emotion_id as emotion
        FROM top.stories s
        JOIN top.publishers p
            ON p.id = s.publisher_id
        JOIN top.story_sentiments sent
            ON s.id = sent.story_id
        JOIN election_distance d
            ON d.election_date = s.published_at
        JOIN publisher_bias pb
            ON pb.publisher_id = p.id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        JOIN top.publisher_pca_onehot pca
            ON pca.publisher_id = p.id
        JOIN story_emotions e
            ON e.story_id = s.id
    """)
    DB.close()

View File

@@ -379,24 +379,34 @@ def debug():
# NOTE(review): the body of `another()` below appears to be a mangled diff
# capture — old and new versions of the emotions-per-year query are
# interleaved without +/- markers (note the consecutive
# `emotions = DB.sql("""` / `DB.sql("""` openings), so this block does NOT
# parse as valid Python as captured here. It is preserved byte-for-byte;
# recover the intended version from version control before editing.
def another():
DB = connect()
DB.sql("""
select
*
from emotions
""")
emotions = DB.sql("""
DB.sql("""
select
year(s.published_at) as year
,se.label as emotion
,count(1) as stories
from stories s
join story_emotions se
on s.id = se.story_id
group by
year(s.published_at)
,se.label
*
from story_emotions
""")
emotions = DB.sql("""
SELECT
YEAR(s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
YEAR(s.published_at)
,e.label
""").df()
emotions
sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
plt.show()

View File

@@ -0,0 +1,9 @@
import plots.sentence
import plots.emotion
import plots.sentiment
__all__ = [
'sentence'
'emotion',
'sentiment',
]

117
src/plots/emotion.py Normal file
View File

@@ -0,0 +1,117 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
def emotion_over_time():
    """Scatter yearly counts of story-title emotions and save as PNG."""
    filename = "emotion_over_time.png"

    DB = connect()
    emotions = DB.sql("""
        SELECT
            date_trunc('year', s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM top.stories s
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            date_trunc('year', s.published_at)
            ,e.label
    """).df()
    DB.close()

    ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)";
    # report the actual saved path instead.
    print(f"saved: {out_path / filename}")
@click.command('plot:emotion-regression')
def emotion_regression():
    """Fit a linear trend of weekly story counts per (emotion, publisher),
    average the yearly slopes by bias rating, and save a heatmap.
    """
    from sklearn import linear_model
    # NOTE(review): train_test_split and ConfusionMatrixDisplay are
    # imported but never used in this function.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import ConfusionMatrixDisplay
    filename = "emotion_regression.png"
    # NOTE(review): `emotions` (the label list) is fetched but unused below.
    DB = connect()
    emotions = DB.query("""
        SELECT
            label
        FROM emotions e
    """).df()['label'].to_list()
    DB.close()
    # Weekly story counts per publisher and emotion. epoch() yields seconds
    # since the Unix epoch, so regression slopes come out in stories/second.
    DB = connect()
    df = DB.sql(f"""
        SELECT
            epoch(date_trunc('yearweek', s.published_at)) AS date
            ,e.id AS emotion_id
            ,p.id as publisher_id
            ,count(1) AS stories
        FROM top.stories s
        JOIN top.publishers p
            ON p.id = s.publisher_id
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            epoch(date_trunc('yearweek', s.published_at))
            ,p.id
            ,e.id
    """).df()
    DB.close()
    results = []
    # One least-squares fit per (emotion, publisher) pair.
    for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
        model = linear_model.LinearRegression()
        x = group['date'].to_numpy().reshape(-1, 1)
        y = group['stories'].to_numpy()
        model.fit(x, y)
        # Convert the slope from stories/second to stories/year.
        per_year = model.coef_.item() * 60 * 60 * 24 * 365
        results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
    results = pd.DataFrame(results)
    # Average yearly slopes across publishers within each bias ordinal.
    DB = connect()
    out = DB.query("""
        SELECT
            e.label as emotion
            --,p.tld
            ,avg(results.per_year) as avg_reg_coef
            ,b.ordinal
        FROM results
        JOIN emotions e
            ON e.id = results.emotion_id
        JOIN top.publishers p
            ON p.id = results.publisher_id
        JOIN publisher_bias pb
            ON pb.publisher_id = results.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        GROUP BY
            e.label
            ,b.ordinal
    """).df()
    DB.close()
    # Rows = emotion, columns = bias ordinal (assumed 5 ordinals, matching
    # the hard-coded tick labels below — TODO confirm).
    pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
    ax = sns.heatmap(pivot, cmap='RdBu_r')
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="slope of regression (stories/year) by bias and emotion"
        ,xticklabels=ticklabels
        ,xlabel="bias"
        ,ylabel="emotion")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # NOTE(review): f-string has no placeholder; prints a literal "(unknown)".
    print(f"saved: (unknown)")

111
src/plots/sentence.py Normal file
View File

@@ -0,0 +1,111 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
def sentence_pca():
    """Scatter the 2-D PCA components of story embeddings, colored by the
    publisher's bias label, and save the figure."""
    filename = "embedding_sentence_pca.png"

    query = """
        SELECT
            pca.first
            ,pca.second
            ,b.bias as label
        FROM top.story_embeddings_pca pca
        JOIN top.stories s
            ON s.id = pca.story_id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """
    db = connect()
    frame = db.query(query).df()
    db.close()

    axis = sns.scatterplot(x=frame['first'], y=frame['second'], hue=frame['label'])
    axis.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(out_path / filename)
@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
    """Scatter per-publisher average-embedding PCA components, colored by
    the publisher's bias label, and save the figure."""
    filename = "avg_embedding_sentence_pca.png"
    DB = connect()
    # NOTE(review): `p.tld` is selected but not used in the plot below.
    data = DB.query("""
        SELECT
            pca.first
            ,pca.second
            ,p.tld
            ,b.bias as label
        FROM top.publisher_embeddings_pca pca
        JOIN top.publishers p
            ON p.id = pca.publisher_id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = p.id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()
    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(out_path / filename)
@click.command('plot:sentence-confusion')
def sentence_confusion():
    """Train a kNN bias classifier on story-title embeddings and save the
    confusion matrix on the held-out test split.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "sentence_confusion.png"

    # Embeddings and their story ids were saved positionally aligned;
    # `index` maps each story id back to its row in `embeddings`.
    embeddings = np.load(data_path / 'embeddings.npy')
    embedding_ids = np.load(data_path / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    DB = connect()
    # (Removed an unused `SELECT * FROM top.publishers` query — its result
    # was never referenced.)
    data = DB.query("""
        SELECT
            ids.index
            ,s.id
            ,b.ordinal
        FROM ids
        JOIN top.stories s
            ON ids.story_id = s.id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()

    train, test = train_test_split(data)
    train_x, train_y = embeddings[train['index']], train['ordinal']
    test_x, test_y = embeddings[test['index']], test['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
    # Tick labels assume the 5 bias ordinals appear in order — TODO confirm
    # all five classes are present in the test split.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_path / filename)
    plt.close()
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved plot: {out_path / filename}")

60
src/plots/sentiment.py Normal file
View File

@@ -0,0 +1,60 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
def over_time():
    """Scatter average story sentiment per publish date and save as PNG."""
    filename = "sentiment_over_time.png"

    DB = connect()
    # class_id is 0/1 (negative/positive), so the average is the positive
    # share of stories published that day.
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
            ,s.published_at as date
        FROM top.story_sentiments sent
        JOIN top.stories s
            ON s.id = sent.story_id
        GROUP BY
            s.published_at
    """).df()
    DB.close()

    ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
    ax.set(title="sentiment vs. time")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved: {out_path / filename}")
@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
    """Scatter average sentiment over time, faceted by bias rating, and
    save as PNG."""
    filename = "bias_vs_sentiment_over_time.png"

    DB = connect()
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
            ,s.published_at as date
            ,b.id as bias_id
        FROM top.story_sentiments sent
        JOIN top.stories s
            ON s.id = sent.story_id
        JOIN publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        GROUP BY
            s.published_at
            ,b.id
    """).df()
    DB.close()

    # relplot returns a FacetGrid (one column of subplots per bias_id);
    # .set() applies the title to every facet.
    ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
    ax.set(title="sentiment vs. time grouped by bias")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved: {out_path / filename}")

View File

@@ -72,16 +72,71 @@ def embed(chunks):
print(f"ids saved: {save_to}")
@click.command('sentence:create-pca-table')
def create_table():
from sklearn import linear_model
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    """Average story embeddings per publisher, project the averages to 2-D
    with PCA, and store the components in top.publisher_embeddings_pca.
    """
    from sklearn.decomposition import PCA

    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
    # Embeddings and their story ids were saved positionally aligned.
    embeddings = np.load(data_path / 'embeddings.npy')
    embedding_ids = np.load(data_path / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    # Fix: the connection was opened twice back-to-back; once is enough.
    DB = connect()
    data = DB.query("""
        SELECT
            ids.index
            ,s.id
            ,s.publisher_id
            ,b.ordinal
        FROM ids
        JOIN top.stories s
            ON ids.story_id = s.id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()

    results = []
    # Fix: group by the scalar key 'publisher_id' (not a one-element list)
    # so the loop variable is a plain value rather than a 1-tuple, which
    # would otherwise corrupt the publisher_id column in modern pandas.
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        # All rows in a group share one publisher, hence one ordinal.
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id' : publisher_id, 'embedding' : avg, 'ordinal' : ordinal})
    results = pd.DataFrame(results)

    x = np.stack(results['embedding'])
    y = results['ordinal']
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    DB = connect()
    DB.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            results.publisher_id as publisher_id
            ,results.first as first
            ,results.second as second
        FROM results
    """)
    DB.close()
    print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
DB = connect()
data = DB.query("""
SELECT
ids.index
@@ -95,19 +150,38 @@ def create_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)
model = PCA(n_components=2)
pred = model.fit_transform(x)
data['first'] = pred[:, 0]
data['second'] = pred[:, 1]
reg = linear_model.LinearRegression()
table_name = f"top.story_embeddings_pca"
reg.fit(x, y)
reg.coef_.shape
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
DB.close()
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
@@ -133,6 +207,8 @@ def create_svm_table():
#y = data['ordinal'].to_numpy().reshape(-1, 1)
y = data['ordinal']
clf = svm.SVC()
pred = clf.fit(x, y)
model = SGDClassifier()
pred = model.fit(x, y)
data['pred'] = pred.predict(x)
data

86
src/sentiment.py Normal file
View File

@@ -0,0 +1,86 @@
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data import connect, data_dir
import numpy as np
from tqdm import tqdm
import click
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
def extract(chunks):
    """Classify the sentiment of every story title with DistilBERT and
    save the predicted class ids (and matching story ids) as .npy files.

    Args:
        chunks: number of batches to split the titles into.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Fix: removed a stray `chunks = 1000` that silently overrode the
    # --chunks CLI option.

    # Load model from HuggingFace Hub.
    # NOTE(review): this is the *base* distilbert checkpoint, not a
    # sentiment fine-tuned one — confirm the intended model.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model = model.to(device)

    # load data
    DB = connect()
    table = DB.sql("""
        select
            id
            ,title
        from stories
        order by id desc
    """).df()
    DB.close()

    # normalize text: strip non-ascii characters before tokenizing
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    chunked = np.array_split(table, chunks)

    # batched inference over title chunks
    iterator = tqdm(chunked, 'embedding')
    sentiments = []
    story_ids = []
    for chunk in iterator:  # (enumerate index was unused; dropped)
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Predicted class = argmax over the classifier logits
        with torch.no_grad():
            logits = model(**encoded_input.to(device)).logits
            sentiment = logits.argmax(axis=1).tolist()
        sentiments.append(sentiment)
        story_ids.append(ids)
    sentiments = np.concatenate(sentiments)
    story_ids = np.concatenate(story_ids)

    # save predictions
    save_to = data_dir() / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
    save_to = data_dir() / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")
@click.command('sentiment:load')
def load():
    """Load the saved sentiment predictions into top.story_sentiments,
    mapping class id 1 -> 'positive' and anything else -> 'negative'.
    """
    # Fix: this module never imports pandas at the top of the file, so
    # `pd.DataFrame` below raised NameError; import it locally.
    import pandas as pd

    sentiments = np.load(data_dir() / 'sentiment.npy')
    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
    # Row order of ids matches the sentiment array (saved together).
    data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
    data['sentiment_id'] = sentiments

    DB = connect()
    # Inner join drops predictions for ids not present in top.stories.
    DB.query("""
        CREATE OR REPLACE TABLE top.story_sentiments AS
        SELECT
            data.story_id
            ,data.sentiment_id as class_id
            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
        FROM data
        JOIN top.stories s
            ON s.id = data.story_id
    """)
    DB.close()