finish presentation.

2023-05-18 19:55:15 -07:00
parent 3f7b3ad467
commit 398228f02c
18 changed files with 975 additions and 184 deletions
--- a/docs/figures/bias_hist.png
+++ b/docs/figures/bias_hist.png
--- a/docs/figures/bias_publisher_hist.png
+++ b/docs/figures/bias_publisher_hist.png
--- a/docs/figures/bias_vs_recent_winner.png
+++ b/docs/figures/bias_vs_recent_winner.png
--- a/docs/figures/bias_vs_sentiment_over_time.png
+++ b/docs/figures/bias_vs_sentiment_over_time.png
--- a/docs/figures/link_confusion.png
+++ b/docs/figures/link_confusion.png
--- a/docs/figures/raw_bias_table.png
+++ b/docs/figures/raw_bias_table.png
--- a/docs/figures/raw_emotion_table.png
+++ b/docs/figures/raw_emotion_table.png
--- a/docs/figures/raw_related_table.png
+++ b/docs/figures/raw_related_table.png
--- a/docs/figures/raw_sentiment_table.png
+++ b/docs/figures/raw_sentiment_table.png
--- a/docs/figures/raw_stories_table.png
+++ b/docs/figures/raw_stories_table.png
--- a/docs/figures/selected_bias_table.png
+++ b/docs/figures/selected_bias_table.png
--- a/docs/presentation.md
+++ b/docs/presentation.md
--- a/src/cli.py
+++ b/src/cli.py
@@ -67,6 +67,7 @@ if __name__ == "__main__":
    cli.add_command(plots.sentiment.over_time)
    cli.add_command(plots.sentiment.bias_over_time)
    cli.add_command(plots.sentiment.bias_vs_recent_winner)
    cli()
--- a/src/data/scrape.py
+++ b/src/data/scrape.py
@@ -348,6 +348,7 @@ def create_elections_table():
            row_number() over() as id
            ,type
            ,date
            ,winner
        FROM df
    """)
@@ -359,6 +360,7 @@ def create_elections_table():
                ,e.id as election_id
                ,e.date as election_date
                ,s.published_at as publish_date
                ,e.winner as winner
            FROM (
                SELECT
                    DISTINCT
@@ -373,6 +375,7 @@ def create_elections_table():
                ,publish_date
                ,election_date
                ,election_id
                ,winner
            FROM cte
        )
        SELECT
@@ -380,6 +383,7 @@ def create_elections_table():
            ,publish_date
            ,election_date
            ,election_id
            ,winner
        FROM windowed
        WHERE rn = 1
    """)
--- a/src/plots/bias.py
+++ b/src/plots/bias.py
@@ -0,0 +1,60 @@
 import click
 from data.main import connect
 import os
 from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
def hist():
    """Plot the number of stories per bias rating and save it as a PNG.

    Joins ``stories`` to ``publisher_bias`` and ``bias_ratings``, counts the
    stories in each ``bias_ratings.ordinal`` group, and writes a bar chart to
    ``out_path / "bias_hist.png"``.
    """
    filename = "bias_hist.png"

    DB = connect()
    data = DB.sql("""
        SELECT
            b.ordinal
            ,count(1) as stories
        FROM stories s
        JOIN publisher_bias pb
        ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            b.ordinal
    """).df()
    DB.close()

    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
    # NOTE(review): the labels assume exactly five ordinal groups come back,
    # ordered left -> right; confirm against the bias_ratings table.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure (as publisher_hist does) so a later plot made in the
    # same process does not draw on top of this one.
    plt.close()
    print(f"saved: {filename}")
@click.command('plot:bias-publisher-hist')
def publisher_hist():
    """Save a bar chart of how many publishers carry each bias rating.

    Counts ``publisher_bias`` rows per ``bias_ratings.ordinal`` and writes the
    chart to ``out_path / "bias_publisher_hist.png"``.
    """
    filename = "bias_publisher_hist.png"
    target = out_path / filename

    conn = connect()
    relation = conn.sql("""
        SELECT
            b.ordinal
            ,count(1) as publishers
        FROM publisher_bias pb
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            b.ordinal
    """)
    counts = relation.df()
    conn.close()

    ax = sns.barplot(x=counts['ordinal'], y=counts['publishers'], color='tab:blue')
    ax.set(
        title="count of publishers per bias rating",
        xlabel="bias rating",
        # one label per ordinal group, left to right
        xticklabels=['left', 'left-center', 'center', 'right-center', 'right'],
    )
    plt.tight_layout()
    plt.savefig(target)
    plt.close()
    print(f"saved: {filename}")
--- a/src/plots/emotion.py
+++ b/src/plots/emotion.py
@@ -115,3 +115,45 @@ def emotion_regression():
    plt.tight_layout()
    plt.savefig(out_path / filename)
    print(f"saved: {filename}")
@click.command('plot:emotion-hist')
def emotion_hist():
    """Plot the number of distinct stories per emotion label.

    Counts distinct stories (and publishers) per ``emotions.label`` via
    ``story_emotions`` and writes a bar chart of the story counts to
    ``out_path / "emotion_hist.png"``.

    The previous body discarded the emotion query result and plotted a
    copy of the bias-rating histogram under the emotion filename.
    """
    filename = "emotion_hist.png"

    DB = connect()
    data = DB.query("""
        select
            e.label
            ,count(distinct s.id) as stories
            ,count(distinct s.publisher_id) as publishers
        from story_emotions se
        join emotions e
        on e.id = se.emotion_id
        join top.stories s
        on s.id = se.story_id
        group by
            e.label
        order by
            e.label
    """).df()
    DB.close()

    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per emotion", xlabel="emotion")
    # The number of emotion labels is not known here; rotate the tick labels
    # so a long list stays readable.
    ax.tick_params(axis='x', rotation=90)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")
--- a/src/plots/links.py
+++ b/src/plots/links.py
@@ -112,3 +112,134 @@ def test():
    # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
    DB.close()
@click.command('plot:link-confusion')
def link_confusion():
    """Fit a kNN classifier on the publisher link matrix and plot its test confusion matrix.

    Each row of the feature matrix is one ``parent_id`` from
    ``top.link_edges``; the columns are the ``links`` counts to every
    ``child_id``. The target is the publisher's bias ``ordinal``. The data is
    split into train/test sets, a 5-nearest-neighbour classifier is fitted
    on the training part, and the confusion matrix on the held-out part is
    saved to ``out_dir / "link_confusion.png"``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"

    DB = connect()
    # NOTE: the second query reads this DataFrame through `from bias`
    # (presumably DuckDB's replacement scan of local variables), so the
    # variable must keep the name `bias`.
    bias = DB.query("""
        SELECT
            p.id as publisher_id
            ,b.ordinal
        FROM top.publishers p
        JOIN top.publisher_bias pb
        ON pb.publisher_id = p.id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
    """).df()

    df = DB.query("""
        SELECT
            *
        FROM top.link_edges
        WHERE parent_id in (
            select
                publisher_id
            from bias
        )
        AND child_id in (
            select
                publisher_id
            from bias
        )
    """).df()
    DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values
    # Look the labels up by the pivot's row ids so y[i] is the label of x[i],
    # even if some rated publisher never appears as a parent.
    # Assumes one bias row per publisher -- TODO confirm.
    y = bias.set_index('publisher_id')['ordinal'].loc[pivot.index]

    # Split features and labels in ONE call: two separate calls shuffle
    # independently and break the feature/label correspondence.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / filename)
    plt.close()
    print(f"saved plot: {filename}")
@click.command('plot:link-classifier')
def link_confusion():
    """Fit a kNN classifier on the full link matrix and plot its predictions.

    Fits a 5-nearest-neighbour classifier on every row of the publisher link
    matrix (no train/test split, so predictions are made on the training
    data) and saves two figures under ``out_dir``:

    * ``link_confusion.png`` -- confusion matrix of true vs. predicted bias
      ordinal.
    * ``link_classifier_pca.png`` -- scatter of the ``first``/``second``
      columns of ``top.publisher_pca_onehot`` coloured by the predicted
      ordinal.

    NOTE(review): this function reuses the name ``link_confusion`` of the
    ``plot:link-confusion`` command defined earlier in the file, so the
    module attribute ends up bound to this command only, and both commands
    write ``link_confusion.png``. The name is kept here so existing
    references keep working; consider renaming.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    confusion_filename = "link_confusion.png"
    # Separate file for the scatter: saving both figures under one name made
    # the second savefig overwrite the confusion matrix.
    scatter_filename = "link_classifier_pca.png"

    DB = connect()
    # NOTE: the second query reads this DataFrame through `from bias`
    # (presumably DuckDB's replacement scan of local variables), so the
    # variable must keep the name `bias`.
    bias = DB.query("""
        SELECT
            p.id as publisher_id
            ,b.ordinal
        FROM top.publishers p
        JOIN top.publisher_bias pb
        ON pb.publisher_id = p.id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
    """).df()

    df = DB.query("""
        SELECT
            *
        FROM top.link_edges
        WHERE parent_id in (
            select
                publisher_id
            from bias
        )
        AND child_id in (
            select
                publisher_id
            from bias
        )
    """).df()

    pca = DB.query("""
        SELECT
            p.id as publisher_id
            ,pca.first
            ,pca.second
        FROM top.publisher_pca_onehot pca
        JOIN top.publishers p
        ON pca.publisher_id = p.id
    """).df()
    DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values
    # Look the labels up by the pivot's row ids so y[i] is the label of x[i].
    # Assumes one bias row per publisher -- TODO confirm.
    y = bias.set_index('publisher_id')['ordinal'].loc[pivot.index]

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)

    # One row per classified publisher, keyed the same way as the PCA table.
    plot = pd.DataFrame({
        'publisher_id': pivot.index.values,
        'ordinal': y.values,
        'pred': y_pred,
    })
    data = pd.merge(plot, pca, on='publisher_id')

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / confusion_filename)
    plt.close()
    print(f"saved plot: {confusion_filename}")

    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
    plt.savefig(out_dir / scatter_filename)
    plt.close()
    print(f"saved plot: {scatter_filename}")
--- a/src/plots/sentiment.py
+++ b/src/plots/sentiment.py
@@ -8,6 +8,7 @@ import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
 def over_time():
    filename = "sentiment_over_time.png"
@@ -30,6 +31,7 @@ def over_time():
    plt.tight_layout()
    plt.savefig(out_path / filename)
    print(f"saved: {filename}")
@click.command('plot:bias-vs-sentiment-over-time')
 def bias_over_time():
    filename = "bias_vs_sentiment_over_time.png"
@@ -38,8 +40,9 @@ def bias_over_time():
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
-            ,s.published_at as date
+            ,date_trunc('yearweek', s.published_at) as date
-            ,b.id as bias_id
+            --,b.ordinal as ordinal
            ,b.bias
        FROM top.story_sentiments sent
        JOIN top.stories s
        ON s.id = sent.story_id
@@ -48,13 +51,88 @@ def bias_over_time():
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
-            s.published_at
+            date_trunc('yearweek', s.published_at)
-            ,b.id
+            ,b.bias
    """).df()
    DB.close()
-    ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
+    order = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="sentiment vs. time grouped by bias")
+    ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
    """Plot average title sentiment against days to the nearest election.

    Averages ``story_sentiments.class_id`` per (``days_away``, bias
    ``ordinal``) group, joining stories to ``election_distance`` on the
    publish date, and saves a scatter plot coloured by ordinal to
    ``out_path / "bias_vs_recent_winner.png"``.
    """
    filename = "bias_vs_recent_winner.png"

    DB = connect()
    data = DB.sql("""
        SELECT
            e.days_away as days_away
            ,b.ordinal
            ,avg(sent.class_id) as sentiment
            ,count(1) as stories
        FROM top.stories s
        JOIN top.story_sentiments sent
        ON s.id = sent.story_id
        JOIN election_distance e
        ON e.publish_date = s.published_at
        JOIN publisher_bias pb
        ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            e.days_away
            ,b.ordinal
    """).df()
    DB.close()

    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")
@click.command('plot:sentiment-hist')
def sentiment_hist():
    """Plot the number of distinct stories per sentiment label.

    Counts distinct stories (and publishers) per
    ``top.story_sentiments.label`` and writes a bar chart of the story
    counts to ``out_path / "sentiment_hist.png"``.

    The previous body discarded the sentiment query result and plotted a
    copy of the bias-rating histogram under the sentiment filename.
    """
    filename = "sentiment_hist.png"

    DB = connect()
    data = DB.query("""
        select
            sent.label
            ,count(distinct s.id) as stories
            ,count(distinct s.publisher_id) as publishers
        from top.story_sentiments sent
        join top.stories s
        on s.id = sent.story_id
        group by
            sent.label
        order by
            sent.label
    """).df()
    DB.close()

    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per sentiment", xlabel="sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")