add better normalization. add link similarity.
src/bias.py (40 lines changed)
@@ -7,14 +7,16 @@ import os
import csv


def map(rating:str) -> int:

    mapping = {
-        'right' : 0,
+        'left' : 0,
        'left-center' : 1,
        'center' : 2,
-        'left' : 3,
-        'allsides' : 4,
-        'right-center' : 5
+        'right-center' : 3,
+        'right' : 4,
+        'allsides' : -1,
    }

    return mapping[rating]
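Review note: the viewer flattened the old and new dicts together above; my reading is that the scale now runs 0-4 from left to right, with 'allsides' moved to a -1 sentinel. A minimal sanity check of that assumption (map_rating is a renamed stand-in, since the module's map() shadows the builtin):

```python
# Hypothetical check of the post-commit ordinal scale in src/bias.py.
def map_rating(rating: str) -> int:  # stand-in name for bias.map
    mapping = {
        'left': 0,
        'left-center': 1,
        'center': 2,
        'right-center': 3,
        'right': 4,
        'allsides': -1,  # sentinel: no single position on the left-right axis
    }
    return mapping[rating]

assert map_rating('center') == 2
assert map_rating('right') - map_rating('left') == 4  # full width of the scale
assert map_rating('allsides') == -1                   # filtered out downstream
```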
@@ -35,13 +37,39 @@ def load() -> None:

def normalize() -> None:
    DB = connect()

    DB.sql("""
        CREATE OR REPLACE TABLE publisher_bias AS
        WITH cte AS (
            SELECT
                p.id
                ,b.bias as label
                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
            FROM bias_ratings b
            JOIN publishers p
                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
        ), ranked AS (
            SELECT
                id
                ,label
                ,similarity
                ,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
            FROM cte
        )
        SELECT
            id
            ,label
        FROM ranked
        WHERE ranked.rn = 1
    """)

    DB.sql("""
        with cte as (
            select
-                s.publisher
+                s.publisher_id
                ,count(1) as stories
            from stories s
-            group by s.publisher
+            group by s.publisher_id
        )
        select
            s.publisher
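Review note: the pattern in this hunk is a fuzzy join on name similarity followed by keep-the-best-match-per-id via ROW_NUMBER. A sketch of the same logic outside DuckDB, with difflib's SequenceMatcher standing in for JARO_WINKLER_SIMILARITY (a different string metric, so the 0.95 threshold would not carry over; 0.8 here is purely for the demo, and the data is invented):

```python
from difflib import SequenceMatcher

publishers = [(1, 'The New York Times'), (2, 'Fox News')]
ratings = [('New York Times', 'left-center'), ('FOX News', 'right')]

def similarity(a: str, b: str) -> float:
    # stand-in for DuckDB's JARO_WINKLER_SIMILARITY; both return 0..1
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

best = {}  # publisher id -> (similarity, label); mirrors "rn = 1" in the SQL
for pid, name in publishers:
    for rated_name, label in ratings:
        score = similarity(name, rated_name)
        if score > 0.8 and score > best.get(pid, (0.0, None))[0]:
            best[pid] = (score, label)

print({pid: label for pid, (score, label) in best.items()})
# {1: 'left-center', 2: 'right'}
```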
@@ -1,10 +1,12 @@
import click
+from dotenv import load_dotenv

@click.group()
def cli():
    ...

if __name__ == "__main__":
+    load_dotenv()
    import scrape
    cli.add_command(scrape.download)
    cli.add_command(scrape.parse)
src/emotion.py (155 lines changed)
@@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
+from matplotlib.dates import DateFormatter
+import matplotlib.dates as mdates

def data():
    # load data
@@ -126,24 +128,153 @@ def normalize():
    """)
    DB.close()

@click.command("emotion:analyze")
def coef_over_time():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def results(buckets = '1 month'):
        results = DB.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()
        return results

    def get_coef(label):
        reg = linear_model.LinearRegression()
        df = results[results['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        # sns.lineplot(x=x_test.flatten(), y=y_pred)
        return reg.coef_

    collection = []
    results = results('2 year')
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(emotion)[0]
        if coef > 0:
            increasing = True
        else:
            increasing = False
        collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
    pd.DataFrame(collection).sort_values('coef')

    plt.show()

@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    def get_coef(emotion):
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_

    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    DB.sql("""
        WITH grouped as (
            SELECT
                YEAR(s.published_at) as year
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            WHERE YEAR(s.published_at) < 2022
            AND label = 'annoyance'
            GROUP BY
                YEAR(s.published_at)
                ,e.label
        ), total AS (
            SELECT
                e.label
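Review note: in both get_coef variants, x and y are split with two independent train_test_split calls; each call shuffles with its own random permutation, so the fitted (x, y) pairs no longer correspond and the learned slope is close to meaningless. A single call keeps the rows aligned; a minimal sketch of the fix on synthetic data (not the repo's tables):

```python
# Hypothetical corrected split: one call keeps x/y rows paired.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x = np.arange(24, dtype=float).reshape(-1, 1)  # stand-in for the time buckets
y = 0.01 * x.ravel() + 0.1                     # stand-in for the story shares

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
reg = LinearRegression().fit(x_train, y_train)
print(reg.coef_[0])  # slope: the per-emotion trend statistic the hunk collects
```

Two smaller issues visible in the hunk: both functions register the same click command name "emotion:analyze", and analyze() interpolates an undefined yearly variable into its final query, so it likely predates the coef_over_time rewrite above it.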
src/links.py (new file, 111 lines)
@@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt


def to_matrix():
    """returns an adjacency matrix of publishers to publisher link frequency"""

    DB = connect()

    bias_map = pd.DataFrame([
        {'label' : 'left', 'value' : 0},
        {'label' : 'left-center', 'value' : 1},
        {'label' : 'center', 'value' : 2},
        {'label' : 'right-center', 'value' : 3},
        {'label' : 'right', 'value' : 4},
        {'label' : 'allsides', 'value' : -1},
    ])
    bias = DB.sql("""
        SELECT
            b.id
            ,b.label
            ,m.value
        FROM publisher_bias b
        JOIN bias_map m
            ON b.label = m.label
        WHERE value != -1
    """).df()

    pub = DB.sql("""
        select
            p.id
            ,p.name
            ,p.url
            ,b.label
            ,b.value
        from publishers p
        left join bias b
            on b.id = p.id
    """).df()

    edges = DB.sql("""
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
                ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
            ON p.id = cte.parent_id
    """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    out = pd.DataFrame(adj.index.values, columns=['id'])
    out = pd.merge(out, pub, how='left', on='id')

    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)

    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)

    x = svd_out[:, 0]
    y = svd_out[:, 1]

    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()

    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)

    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()

    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
    plt.show()
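Review note: to_matrix pivots the link counts into a publisher-by-publisher adjacency matrix, projects it with PCA (TruncatedSVD is computed as an alternative), and clusters the projection with MiniBatchKMeans. The same shape of pipeline on synthetic data (every value here is invented for the demo):

```python
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

# synthetic edge list: parent publisher links to child publisher `links` times
edges = pd.DataFrame({
    'parent_id': [1, 1, 2, 2, 3, 3, 4],
    'child_id':  [2, 3, 1, 3, 1, 2, 1],
    'links':     [5, 1, 4, 2, 1, 6, 3],
})

# same pivot as links.py: rows = parents, columns = children, 0 where no link
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

pca_out = PCA(n_components=2).fit_transform(adj)

kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init='auto')
pred = kmeans.fit_predict(pca_out)
print(pred)  # one cluster label per publisher row of the adjacency matrix
```

The final scatterplot references pub['first'], pub['second'], and pub['bias'], columns the pub frame defined above does not have, so that line looks like scratch work left in the commit.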
@@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
+from urllib.parse import urlparse

@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@@ -103,12 +104,14 @@ def parse(directory, output_dir):

            url = item.xpath('.//strong/a')[0].get('href')
            out['url'] = url
+            out['publisher_url_domain'] = urlparse(publisher_url).netloc
            out['domain'] = urlparse(url).netloc

            item_id = hash((page.stem, url))
            out['id'] = item_id

-            old_id = hash((title, page.stem, publisher_url))
-            out['old_id'] = old_id
+            # old_id = hash((title, page.stem, publisher_url))
+            # out['old_id'] = old_id
            published.append(out)

            related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -118,6 +121,7 @@ def parse(directory, output_dir):
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
+                another['publisher_domain'] = urlparse(another['url']).netloc
                others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
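Review note on the id scheme, an observation rather than part of the commit: Python's builtin hash() is randomized per process for strings (PYTHONHASHSEED), so hash((page.stem, url)) produces different ids on every run, and re-running parse would orphan previously stored parent_id references. A deterministic alternative would look like:

```python
# Hypothetical stable replacement for hash((page.stem, url)).
import hashlib

def stable_id(page_stem: str, url: str) -> int:
    digest = hashlib.sha256(f'{page_stem}|{url}'.encode()).digest()
    # deterministic across processes, unlike builtin hash(); fits a BIGINT
    return int.from_bytes(digest[:8], 'big', signed=True)
```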
@@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    DB = connect()

    DB.sql("""
        DELETE FROM stories
        WHERE id IN (
@@ -146,29 +151,77 @@ def normalize():
            OR title_ctn > 1
        )
    """)

    DB.sql("""
        CREATE OR REPLACE TABLE publishers AS
        with cte as (
            SELECT
-                s.publisher
-                ,s.publisher_url
+                s.publisher as name
+                ,s.publisher_url_domain as url
            FROM stories s
            GROUP BY
                s.publisher
-                ,s.publisher_url
+                ,s.publisher_url_domain
        ), together AS (
            SELECT
-                COALESCE(cte.publisher, r.publisher) AS publisher
-                ,cte.publisher_url
+                COALESCE(cte.name, r.publisher) AS name
+                ,COALESCE(cte.url, r.publisher_domain) as url
            FROM cte
            FULL OUTER JOIN related_stories r
-                ON cte.publisher = r.publisher
+                ON cte.url = r.publisher_domain
        )
        SELECT
            ROW_NUMBER() OVER() as id
-            ,t.*
+            ,t.name
+            ,t.url
        FROM together t
+        where t.url is not null
        GROUP BY
-            publisher
-            ,publisher_url
+            name
+            ,url
    """)

    DB.sql("""
        alter table stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = stories.publisher_url_domain
    """)

    DB.sql("""
        alter table stories alter publisher_id set data type bigint
    """)

    DB.sql("""
        alter table stories drop publisher;
        alter table stories drop publisher_url;
        alter table stories drop publisher_url_domain;
        alter table stories drop domain;
    """)

    DB.sql("""
        alter table related_stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update related_stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = related_stories.publisher_domain
    """)

    DB.sql("""
        alter table related_stories drop publisher;
        alter table related_stories drop publisher_domain;
    """)
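Review note: swapping the join key from publisher display name to URL domain is the heart of the "better normalization" in the commit title: names drift across scrapes ('FOX News' vs 'Fox News') while the netloc stays stable. A self-contained illustration with duckdb and urllib (table name and values invented for the demo):

```python
import duckdb
from urllib.parse import urlparse

urls = ['https://www.foxnews.com/politics/story-1',
        'https://www.foxnews.com/us/story-2']
print({urlparse(u).netloc for u in urls})  # one domain, however the name is spelled

db = duckdb.connect()
db.sql("CREATE TABLE demo_stories AS SELECT * FROM (VALUES "
       "('Fox News', 'www.foxnews.com'), ('FOX News', 'www.foxnews.com')) "
       "t(publisher, publisher_url_domain)")
# same shape as the commit's publishers rebuild: one row per domain, with an id
db.sql("""
    SELECT ROW_NUMBER() OVER() AS id, name, url
    FROM (SELECT ANY_VALUE(publisher) AS name, publisher_url_domain AS url
          FROM demo_stories GROUP BY publisher_url_domain)
""").show()
```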
@@ -81,3 +81,4 @@ def distance():
    min_index = (np.argmin(distances))
    closest = np.unravel_index(min_index, distances.shape)
    distances.flatten().shape
+   DB.close()
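Review note: the argmin/unravel_index pairing in distance() is the standard way to turn a flat argmin into a (row, col) index; a tiny sketch with made-up numbers:

```python
import numpy as np

distances = np.array([[0.9, 0.2],
                      [0.4, 0.7]])
closest = np.unravel_index(np.argmin(distances), distances.shape)
print(closest)  # (0, 1): the smallest entry sits at row 0, column 1
```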