finish presentation.

2023-05-18 19:55:15 -07:00 · 2023-05-18 19:55:15 -07:00 · 398228f02c
parent 3f7b3ad467
commit 398228f02c
18 changed files with 975 additions and 184 deletions
--- a/docs/figures/bias_hist.png
+++ b/docs/figures/bias_hist.png
--- a/docs/figures/bias_publisher_hist.png
+++ b/docs/figures/bias_publisher_hist.png
--- a/docs/figures/bias_vs_recent_winner.png
+++ b/docs/figures/bias_vs_recent_winner.png
--- a/docs/figures/bias_vs_sentiment_over_time.png
+++ b/docs/figures/bias_vs_sentiment_over_time.png
--- a/docs/figures/link_confusion.png
+++ b/docs/figures/link_confusion.png
--- a/docs/figures/raw_bias_table.png
+++ b/docs/figures/raw_bias_table.png
--- a/docs/figures/raw_emotion_table.png
+++ b/docs/figures/raw_emotion_table.png
--- a/docs/figures/raw_related_table.png
+++ b/docs/figures/raw_related_table.png
--- a/docs/figures/raw_sentiment_table.png
+++ b/docs/figures/raw_sentiment_table.png
--- a/docs/figures/raw_stories_table.png
+++ b/docs/figures/raw_stories_table.png
--- a/docs/figures/selected_bias_table.png
+++ b/docs/figures/selected_bias_table.png
--- a/docs/presentation.md
+++ b/docs/presentation.md
--- a/src/cli.py
+++ b/src/cli.py
@ -67,6 +67,7 @@ if __name__ == "__main__":

    cli.add_command(plots.sentiment.over_time)
    cli.add_command(plots.sentiment.bias_over_time)
+    cli.add_command(plots.sentiment.bias_vs_recent_winner)


    cli()
--- a/src/data/scrape.py
+++ b/src/data/scrape.py
@ -348,6 +348,7 @@ def create_elections_table():
            row_number() over() as id
            ,type
            ,date
+            ,winner
        FROM df
    """)

@ -359,6 +360,7 @@ def create_elections_table():
                ,e.id as election_id
                ,e.date as election_date
                ,s.published_at as publish_date
+                ,e.winner as winner
            FROM (
                SELECT
                    DISTINCT
@ -373,6 +375,7 @@ def create_elections_table():
                ,publish_date
                ,election_date
                ,election_id
+                ,winner
            FROM cte
        )
        SELECT
@ -380,6 +383,7 @@ def create_elections_table():
            ,publish_date
            ,election_date
            ,election_id
+            ,winner
        FROM windowed
        WHERE rn = 1
    """)
--- a/src/plots/bias.py
+++ b/src/plots/bias.py
@ -0,0 +1,60 @@
+import click
+from data.main import connect
+import os
+from pathlib import Path
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
+
+@click.command('plot:bias-hist')
+def hist():
+    """Save a bar chart of story counts per bias rating to ``bias_hist.png``.
+
+    Joins ``stories`` to ``bias_ratings`` through ``publisher_bias``, counts
+    the stories for each ``ordinal`` value, and writes the figure under
+    ``out_path``.
+    """
+    filename = "bias_hist.png"
+
+    DB = connect()
+    data = DB.sql("""
+        SELECT
+            b.ordinal
+            ,count(1) as stories
+        FROM stories s
+        JOIN publisher_bias pb
+        ON pb.publisher_id = s.publisher_id
+        JOIN bias_ratings b
+        ON b.id = pb.bias_id
+        GROUP BY
+            b.ordinal
+    """).df()
+    DB.close()
+
+    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
+    # NOTE(review): the five labels assume the query yields exactly five
+    # ordinals that sort left-to-right -- confirm against bias_ratings.
+    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    plt.tight_layout()
+    plt.savefig(out_path / filename)
+    # Close the figure so a later plot in the same process starts from clean
+    # axes; publisher_hist already does this.
+    plt.close()
+    print(f"saved: {filename}")
+
+@click.command('plot:bias-publisher-hist')
+def publisher_hist():
+    """Save a bar chart of publisher counts per bias rating.
+
+    Counts ``publisher_bias`` rows per ``bias_ratings.ordinal`` and writes the
+    figure to ``bias_publisher_hist.png`` under ``out_path``.
+    """
+    filename = "bias_publisher_hist.png"
+    query = """
+        SELECT
+            b.ordinal
+            ,count(1) as publishers
+        FROM publisher_bias pb
+        JOIN bias_ratings b
+        ON b.id = pb.bias_id
+        GROUP BY
+            b.ordinal
+    """
+
+    conn = connect()
+    counts = conn.sql(query).df()
+    conn.close()
+
+    ratings = counts['ordinal']
+    totals = counts['publishers']
+    axes = sns.barplot(x=ratings, y=totals, color='tab:blue')
+    axes.set_title("count of publishers per bias rating")
+    axes.set_xlabel("bias rating")
+    axes.set_xticklabels(['left', 'left-center', 'center', 'right-center', 'right'])
+
+    plt.tight_layout()
+    target = out_path / filename
+    plt.savefig(target)
+    plt.close()
+    print(f"saved: {filename}")
--- a/src/plots/emotion.py
+++ b/src/plots/emotion.py
@ -115,3 +115,45 @@ def emotion_regression():
    plt.tight_layout()
    plt.savefig(out_path / filename)
    print(f"saved: {filename}")
+
+@click.command('plot:emotion-hist')
+def emotion_hist():
+    """Save a bar chart of story counts per emotion label to ``emotion_hist.png``.
+
+    Counts distinct stories and publishers per ``emotions.label`` (via
+    ``story_emotions`` and ``top.stories``), prints that summary as a markdown
+    table, and plots the per-label story counts under ``out_path``.
+    """
+    filename = "emotion_hist.png"
+
+    DB = connect()
+    data = DB.query("""
+        select
+            e.label
+            ,count(distinct s.id) as stories
+            ,count(distinct s.publisher_id) as publishers
+        from story_emotions se
+        join emotions e
+        on e.id = se.emotion_id
+        join top.stories s
+        on s.id = se.story_id
+        group by
+            e.label
+    """).df()
+    DB.close()
+
+    # The summary table used to be rendered and then discarded; print it so
+    # running the command actually surfaces it.
+    print(data.to_markdown(index=False))
+
+    # Plot the emotion counts themselves. The previous body re-ran the bias
+    # rating histogram and saved it under the emotion filename.
+    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
+    ax.set(title="count of stories per emotion", xlabel="emotion")
+    plt.tight_layout()
+    plt.savefig(out_path / filename)
+    plt.close()
+    print(f"saved: {filename}")
--- a/src/plots/links.py
+++ b/src/plots/links.py
@ -112,3 +112,134 @@ def test():
    # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
    DB.close()

+
+@click.command('plot:link-confusion')
+def link_confusion():
+    """Plot a confusion matrix for a kNN bias classifier on held-out publishers.
+
+    Builds a parent-by-child link-count matrix from ``top.link_edges``
+    (restricted to publishers that have a bias rating), fits a 5-neighbour
+    classifier on a train split, predicts the test split, and saves the
+    confusion matrix to ``link_confusion.png`` under ``out_dir``.
+    """
+    from sklearn.model_selection import train_test_split
+    from sklearn.neighbors import KNeighborsClassifier
+    from sklearn.metrics import ConfusionMatrixDisplay
+
+    filename = "link_confusion.png"
+
+    DB = connect()
+    bias = DB.query("""
+        SELECT
+            p.id as publisher_id
+            ,b.ordinal
+        FROM top.publishers p
+        JOIN top.publisher_bias pb
+        ON pb.publisher_id = p.id
+        JOIN bias_ratings b
+        ON b.id = pb.bias_id
+    """).df()
+
+    # NOTE(review): ``from bias`` below presumably resolves to the local
+    # ``bias`` DataFrame above (DuckDB-style replacement scan) -- confirm, and
+    # do not rename that variable without updating the SQL.
+    df = DB.query("""
+        SELECT
+            *
+        FROM top.link_edges
+        WHERE parent_id in (
+            select
+                publisher_id
+            from bias
+        )
+        AND child_id in (
+            select
+                publisher_id
+            from bias
+        )
+    """).df()
+    DB.close()
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+
+    x = pivot.values
+    # NOTE(review): assumes every rated publisher appears exactly once as a
+    # parent_id, so the sorted labels line up with the pivot rows -- confirm.
+    y = bias.sort_values('publisher_id').ordinal
+
+    # Split features and labels in ONE call so each row keeps its own label.
+    # Two separate calls shuffle independently and pair rows with the wrong
+    # labels.
+    x_train, x_test, y_train, y_test = train_test_split(x, y)
+
+    model = KNeighborsClassifier(n_neighbors=5)
+    model.fit(x_train, y_train)
+    y_pred = model.predict(x_test)
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
+    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
+    plt.savefig(out_dir / filename)
+    plt.close()
+    print(f"saved plot: {filename}")
+
+@click.command('plot:link-classifier')
+def link_confusion():
+    """Fit a kNN bias classifier on the publisher link matrix and plot its output.
+
+    The model is fitted and evaluated on the same rows (no hold-out). Two
+    figures are written under ``out_dir``: the confusion matrix of those
+    predictions (``link_confusion.png``) and a scatter of the ``first`` /
+    ``second`` columns of ``top.publisher_pca_onehot`` coloured by predicted
+    bias (``link_classifier_pca.png``).
+    """
+    # NOTE(review): this function reuses the name ``link_confusion`` that the
+    # 'plot:link-confusion' command earlier in this module already binds, so
+    # it replaces that module attribute. The name is left unchanged here to
+    # keep the interface stable -- consider renaming to ``link_classifier``.
+    from sklearn.model_selection import train_test_split
+    from sklearn.neighbors import KNeighborsClassifier
+    from sklearn.metrics import ConfusionMatrixDisplay
+
+    filename = "link_confusion.png"
+    # Separate target for the scatter plot. It used to be saved to
+    # ``filename`` as well, overwriting the confusion matrix just written.
+    scatter_filename = "link_classifier_pca.png"
+
+    DB = connect()
+    bias = DB.query("""
+        SELECT
+            p.id as publisher_id
+            ,b.ordinal
+        FROM top.publishers p
+        JOIN top.publisher_bias pb
+        ON pb.publisher_id = p.id
+        JOIN bias_ratings b
+        ON b.id = pb.bias_id
+    """).df()
+
+    # NOTE(review): ``from bias`` below presumably resolves to the local
+    # ``bias`` DataFrame above (DuckDB-style replacement scan) -- confirm, and
+    # do not rename that variable without updating the SQL.
+    df = DB.query("""
+        SELECT
+            *
+        FROM top.link_edges
+        WHERE parent_id in (
+            select
+                publisher_id
+            from bias
+        )
+        AND child_id in (
+            select
+                publisher_id
+            from bias
+        )
+    """).df()
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+
+    x = pivot.values
+    # NOTE(review): assumes every rated publisher appears exactly once as a
+    # parent_id, so the sorted labels line up with the pivot rows -- confirm.
+    y = bias.sort_values('publisher_id').ordinal
+
+    data = DB.query("""
+        SELECT
+            p.id as publisher_id
+            ,pca.first
+            ,pca.second
+        FROM top.publisher_pca_onehot pca
+        JOIN top.publishers p
+        ON pca.publisher_id = p.id
+    """).df()
+    DB.close()
+
+    model = KNeighborsClassifier(n_neighbors=5)
+    model.fit(x, y)
+    y_pred = model.predict(x)
+
+    # Attach predictions to the sorted bias rows, then join the PCA
+    # coordinates on the shared ``publisher_id`` column.
+    plot = bias.sort_values('publisher_id')
+    plot['pred'] = y_pred
+    data = pd.merge(plot, data)
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
+    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
+    plt.savefig(out_dir / filename)
+    plt.close()
+    print(f"saved plot: {filename}")
+
+    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
+    plt.savefig(out_dir / scatter_filename)
+    plt.close()
+    print(f"saved plot: {scatter_filename}")
--- a/src/plots/sentiment.py
+++ b/src/plots/sentiment.py
@ -8,6 +8,7 @@ import numpy as np
 import pandas as pd

 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
+
@click.command('plot:sentiment-over-time')
 def over_time():
    filename = "sentiment_over_time.png"
@ -30,6 +31,7 @@ def over_time():
    plt.tight_layout()
    plt.savefig(out_path / filename)
    print(f"saved: {filename}")
+
@click.command('plot:bias-vs-sentiment-over-time')
 def bias_over_time():
    filename = "bias_vs_sentiment_over_time.png"
@ -38,8 +40,9 @@ def bias_over_time():
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
-            ,s.published_at as date
-            ,b.id as bias_id
+            ,date_trunc('yearweek', s.published_at) as date
+            --,b.ordinal as ordinal
+            ,b.bias
        FROM top.story_sentiments sent
        JOIN top.stories s
        ON s.id = sent.story_id
@ -48,13 +51,88 @@ def bias_over_time():
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
-            s.published_at
-            ,b.id
+            date_trunc('yearweek', s.published_at)
+            ,b.bias
    """).df()
    DB.close()

-    ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
-    ax.set(title="sentiment vs. time grouped by bias")
+    order = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
+    plt.tight_layout()
+    plt.savefig(out_path / filename)
+    plt.close()
+    print(f"saved: {filename}")
+
+@click.command('plot:sentiment-recent-winner')
+def bias_vs_recent_winner():
+    """Save a scatter of average title sentiment vs. days to the nearest election.
+
+    Averages ``story_sentiments.class_id`` per (``days_away``, bias
+    ``ordinal``) pair, colours the points by ordinal, and writes
+    ``bias_vs_recent_winner.png`` under ``out_path``.
+    """
+    filename = "bias_vs_recent_winner.png"
+
+    DB = connect()
+    data = DB.sql("""
+        SELECT
+            e.days_away as days_away
+            ,b.ordinal
+            ,avg(sent.class_id) as sentiment
+            ,count(1) as stories
+        FROM top.stories s
+        JOIN top.story_sentiments sent
+        ON s.id = sent.story_id
+        JOIN election_distance e
+        ON e.publish_date = s.published_at
+        JOIN publisher_bias pb
+        ON pb.publisher_id = s.publisher_id
+        JOIN bias_ratings b
+        ON b.id = pb.bias_id
+        GROUP BY
+            e.days_away
+            ,b.ordinal
+    """).df()
+    DB.close()
+
+    # NOTE(review): despite the function name, the election winner is not
+    # used in this query -- confirm whether that is intended.
+    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
+    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
+    plt.tight_layout()
+    plt.savefig(out_path / filename)
+    plt.close()
+
+    print(f"saved: {filename}")
+
+@click.command('plot:sentiment-hist')
+def sentiment_hist():
+    """Save a bar chart of story counts per sentiment label to ``sentiment_hist.png``.
+
+    Counts distinct stories and publishers per ``story_sentiments.label``,
+    prints that summary as a markdown table, and plots the per-label story
+    counts under ``out_path``.
+    """
+    filename = "sentiment_hist.png"
+
+    DB = connect()
+    data = DB.query("""
+        select
+            sent.label
+            ,count(distinct s.id) as stories
+            ,count(distinct s.publisher_id) as publishers
+        from top.story_sentiments sent
+        join top.stories s
+        on s.id = sent.story_id
+        group by
+            sent.label
+    """).df()
+    DB.close()
+
+    # The summary table used to be rendered and then discarded; print it so
+    # running the command actually surfaces it.
+    print(data.to_markdown(index=False))
+
+    # Plot the sentiment counts themselves. The previous body re-ran the bias
+    # rating histogram and saved it under the sentiment filename.
+    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
+    ax.set(title="count of stories per sentiment", xlabel="sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
+    plt.close()
    print(f"saved: {filename}")