finish paper

2023-12-15 09:34:18 -08:00
parent a21ed7a7d9
commit f069a37ca6
19 changed files with 547 additions and 315 deletions
--- a/src/data/factcheck.py
+++ b/src/data/factcheck.py
@@ -8,7 +8,7 @@ from pathlib import Path
 import os
 import sys
 import click
-from data.main import connect, map_tld, paths, reporting_label_to_int
+from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
 from random import randint
 from time import sleep
 from tqdm import tqdm
@@ -128,6 +128,8 @@ def create_tables():
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

+    df.tld
+
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
--- a/src/data/links.py
+++ b/src/data/links.py
@@ -1,6 +1,8 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans

@click.command('links:create-table')
 def create_table():
@@ -53,7 +55,6 @@ def create_table():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def create_pca(source):
    """create 2D pca labels"""
-    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

@@ -62,8 +63,6 @@ def create_pca(source):
            SELECT
                p.*
            FROM mbfc.publishers p
-            JOIN mbfc.publisher_stories ps
-            ON p.id = ps.publisher_id
        """).df()
        df = db.query(f"""
            SELECT
@@ -98,9 +97,10 @@ def create_pca(source):
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def create_clusters(source):
    """create link adj. matrix clusters table"""
-    from sklearn.cluster import KMeans

+    source = 'links'
    table_name = f"publisher_clusters_{source}"
+
    with connect() as db:
        df = db.query(f"""
            SELECT
@@ -113,17 +113,18 @@ def create_clusters(source):
            SELECT
                p.*
            FROM mbfc.publishers p
-            JOIN mbfc.publisher_stories ps
-            ON ps.publisher_id = p.id
        """).df()
+
+
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-    k = 8
+    k = 5
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)
    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]
+
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
@@ -132,4 +133,5 @@ def create_clusters(source):
                ,n.label as label
            FROM new_table n
        """)
+
    print(f"created {table_name}")
--- a/src/plots/descriptive.py
+++ b/src/plots/descriptive.py
@@ -11,7 +11,7 @@ def articles_per_year():
    save_to = paths('figures') / 'articles_per_year.png'

    with connect() as db:
-        data = DB.query("""
+        data = db.query("""
            select
                year(published_at) as year
                ,count(1) as stories
@@ -27,6 +27,40 @@ def articles_per_year():
    plt.savefig(save_to)
    print(f"saved: {save_to}")

+@click.command('descriptive:articles-per-bias-per-year')
+def articles_per_bias_per_year():
+    """Plot yearly story counts per bias label ('L%'/'R%' labels only), skipping 2005 and 2023."""
+    save_to = paths('figures') / 'articles_per_bias_per_year.png'
+    # the bias filter is parenthesized: AND binds tighter than OR, so without it 'R%' rows skip the year filter
+    with connect() as db:
+        data = db.query("""
+            select
+                date_trunc('year', s.published_at) as year
+                ,p.bias
+                ,count(1) as stories
+            from stories s
+            join mbfc.publisher_stories ps
+            on ps.story_id = s.id
+            join mbfc.publishers p
+            on p.id = ps.publisher_id
+            where year(s.published_at) not in (2005, 2023)
+            and (p.bias ilike 'L%'
+            or p.bias ilike 'R%')
+            group by 
+                date_trunc('year', s.published_at)
+                ,p.bias
+            order by mode(p.ordinal)
+        """).df()
+
+    fig, ax = plt.subplots(figsize=(5, 5))
+    sns.lineplot(x=data.year, y=data.stories, hue=data.bias, ax=ax, palette='rainbow')
+    ax.tick_params(axis='x', rotation=90)
+    ax.set(ylabel="count of stories (#)")
+    plt.legend(loc='upper right')
+    plt.tight_layout()
+    plt.savefig(save_to)
+    print(f"saved: {save_to}")
+
@click.command('descriptive:distinct-publishers')
 def distinct_publishers():
    save_to = paths('figures') / 'distinct_publishers.png'
--- a/src/plots/links.py
+++ b/src/plots/links.py
@@ -3,13 +3,14 @@ from data.main import connect, ticklabels, paths
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
-from sklearn.metrics import silhouette_score
 import pandas as pd
-
+from sklearn.cluster import KMeans
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import ConfusionMatrixDisplay, silhouette_score

@click.command('links:elbow')
 def elbow():
-    from sklearn.cluster import KMeans

    save_to = paths('figures') / 'link_cluster_elbow.png'

@@ -32,7 +33,7 @@ def elbow():
    to_plot = pd.DataFrame(to_plot)

    ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
-    ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
+    ax.set(xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
@@ -43,40 +44,31 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def link_pca_clusters(source):

+    source = 'onehot'
    save_to = paths('figures') / f"link_pca_clusters_{source}.png"

    with connect() as db:
        df = db.query(f"""
            SELECT
-                c.label as cluster
-                ,p.tld
-                --,b.label as bias
-                ,pca.first
+                pca.first
                ,pca.second
-                ,s.cnt as stories
-            FROM publisher_clusters_{source} c
-            JOIN mbfc.publisher_stories ps
-            ON ps.publisher_id = c.publisher_id
-            JOIN mbfc.publishers p
-            ON ps.publisher_id = p.id
-            JOIN 
-            (
-                select
-                    p.id as publisher_id
-                    ,count(1) as cnt
-                FROM mbfc.publishers p
-                GROUP BY
-                    p.id
-            ) s
-            ON s.publisher_id = p.id
+                ,pca.publisher_id
+                ,p.ordinal as bias
+                ,p.name
+                ,clusters.label as cluster
+                ,count(1) over() as cnt
+            FROM mbfc.publishers p
            JOIN publisher_pca_{source} pca
-            ON pca.publisher_id = p.id
+            ON p.id = pca.publisher_id
+            JOIN publisher_clusters_{source} clusters
+            ON p.id = clusters.publisher_id
        """).df()

-    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
+    ax = sns.scatterplot(df, x='first', y='second', hue='cluster')
    ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
    plt.savefig(save_to)
    print(f"saved plot: {save_to}")
+    plt.close()


 def test():
@@ -108,9 +100,6 @@ def test():

@click.command('links:confusion')
 def link_confusion():
-    from sklearn.model_selection import train_test_split
-    from sklearn.neighbors import KNeighborsClassifier
-    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"
    save_to = paths('figures') / filename
@@ -119,28 +108,13 @@ def link_confusion():
        bias = db.query("""
            SELECT
                p.id as publisher_id
-                ,b.ordinal
-            FROM top.publishers p
-            JOIN top.publisher_bias pb
-            ON pb.publisher_id = p.id
-            JOIN bias_ratings b
-            ON b.id = pb.bias_id
+                ,p.ordinal
+            FROM mbfc.publishers p
        """).df()
-
        df = db.query("""
            SELECT
                *
-            FROM top.link_edges
-            WHERE parent_id in (
-                select
-                    publisher_id
-                from bias
-            )
-            AND child_id in (
-                select
-                    publisher_id
-                from bias
-            )
+            FROM link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
@@ -167,9 +141,6 @@ def link_confusion():

@click.command('links:classifier')
 def link_confusion():
-    from sklearn.model_selection import train_test_split
-    from sklearn.neighbors import KNeighborsClassifier
-    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "link_confusion.png"

@@ -204,15 +175,20 @@ def link_confusion():
    x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
    y = publisher_matrix['ordinal']

+    x_train, x_test = train_test_split(x)
+    y_train, y_test = train_test_split(y)
+
    model = KNeighborsClassifier(n_neighbors=5)
-    model.fit(x, y)
-    y_pred = model.predict(x)
+    model.fit(x_train, y_train)
+    y_pred = model.predict(x_test)
+
    publisher_matrix['pred'] = y_pred
    publisher_matrix


    fig, ax = plt.subplots(figsize=(5, 5))
-    ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
+    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
+    ax.legend().remove()
    ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.xticks(rotation=45)
    plt.tight_layout()