add better normalization. add link similarity.

matt 2023-05-07 22:07:26 -07:00
parent 3a6f97b290
commit 4bd9f46edd
7 changed files with 383 additions and 29 deletions

View File

@@ -1,5 +1,29 @@
# Data Mining - CSCI 577
# Project Status Report IV
*2023-04-25*
This project report will take the form of an initial draft of the final report, making use of the template discussed in class and made available on Canvas. Minimally, this draft should include the following:
1. Data preparation
2. Policy for dealing with missing attribute values
3. If your project is one of classification, discuss:
a. Intelligent discretization
b. Identification of useless attributes
c. Policy for violations of the adequacy condition and missing attribute values
4. If your project is one of clustering:
a. Elimination of noise attributes
b. Proper choice or development of distance measures
5. If your project is one of association rule analysis:
a. What are the "market baskets"?
b. How are thresholds for support and confidence developed?
6. In all cases, you should specify:
a. What computational experiments you have conducted, or plan to conduct.
# Project Status Report III
*2023-04-18*
@@ -35,6 +59,10 @@ I will use the following suite of python tools to conduct my research:
> This progress should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
The ultimate purpose of the project is to track the progress of political discourse as a function of time and publisher.
Using a dataset of article titles and publications, the aim is to classify article titles with a sentiment analysis language model.
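As a rough sketch of that classification step (illustrative only; the model name and example titles below are placeholders, not the project's actual configuration), an off-the-shelf emotion classifier can score a handful of titles:

```python
from transformers import pipeline

# Sketch: score example article titles with a pretrained emotion classifier.
# The model name is an assumed example, not necessarily the project's choice.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,  # return a score for every label
)

titles = [
    "Senate passes sweeping budget bill after all-night session",
    "Outrage grows over new federal court ruling",
]

for title, scores in zip(titles, classifier(titles)):
    best = max(scores, key=lambda s: s["score"])
    print(f"{best['label']:>10}  {best['score']:.2f}  {title}")
```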
\newpage
# Project Status Report II

View File

@@ -7,14 +7,16 @@ import os
import csv
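# map an ordinal bias rating to an integer position from left (0) to right (4);
# 'allsides' is not a position on the spectrum, so it maps to -1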
def map(rating:str) -> int:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
@@ -35,13 +37,39 @@ def load() -> None:
def normalize() -> None:
DB = connect()
DB.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
id
,label
FROM ranked
WHERE ranked.rn = 1
""")
DB.sql("""
with cte as (
select
s.publisher_id
,count(1) as stories
from stories s
group by s.publisher_id
)
select
s.publisher

View File

@@ -1,10 +1,12 @@
import click
from dotenv import load_dotenv
@click.group()
def cli():
...
if __name__ == "__main__":
load_dotenv()
import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)

View File

@@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
def data():
# load data
@@ -126,24 +128,153 @@ def normalize():
""")
DB.close()
@click.command("emotion:analyze")
def coef_over_time():
"""plot and group emotional labels"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def results(buckets = '1 month'):
results = DB.sql(f"""
with cte as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
)
select
epoch(cte.date) / 60 / 60 / 24 / 365 as date
,cte.label
,cast(cte.stories as float) / t.stories as stories
from cte
join total t
on t.date = cte.date
""").df()
return results
def get_coef(label):
reg = linear_model.LinearRegression()
df = results[results['label'] == label]
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
# split x and y together so the train/test rows stay aligned
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
# y_pred = reg.predict(x_test)
# sns.lineplot(x=x_test.flatten(), y=y_pred)
return reg.coef_
collection = []
results = results('2 year')
for emotion in emotions['label']:
if emotion == 'neutral':
continue
coef = get_coef(emotion)[0]
if coef > 0:
increasing = True
else:
increasing = False
collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
pd.DataFrame(collection).sort_values('coef')
plt.show()
@click.command("emotion:analyze")
def analyze():
"""plot and group emotional labels"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def get_coef(emotion):
df = DB.sql("""
with cte as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
--AND e.label in ('neutral', 'annoyance')
AND e.label in ('sadness')
GROUP BY
time_bucket(interval '1 month', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '1 month', s.published_at)
)
select
epoch(cte.date) as date
,cte.label
--,total.stories as total
,cast(cte.stories as float) / e.stories as stories
from cte
join emotions e
--on total.date = cte.date
on e.label = cte.label
""").df()
reg = linear_model.LinearRegression()
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
# split x and y together so the train/test rows stay aligned
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
#y_pred = reg.predict(x_test)
return reg.coef_
df = DB.sql(f"""{yearly}""").df()
df['date'] = pd.to_datetime(df['date'])
ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
plt.locator_params(axis='y', nbins=6)
ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
plt.show()
DB.sql("""
WITH grouped as (
), total AS (
SELECT
e.label

src/links.py (new file, 111 lines)
View File

@@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
DB = connect()
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
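# note: DuckDB can reference the in-memory bias_map DataFrame by name in the
# SQL below (a replacement scan), which is what the JOIN against bias_map uses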
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
,b.label
,b.value
from publishers p
left join bias b
on b.id = p.id
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
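# adj rows: publisher of the parent story; columns: publisher of the related
# (linked) story; values: link counts, with missing pairs filled as 0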
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
plt.show()
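# Illustrative sanity check (not part of this commit; assumes `adj` as built above):
# inspect how much of the link-count variance the 4 PCA components retain
# before reading too much into the 2-D scatter plots.
#
#   pca = PCA(n_components=4)
#   pca.fit(adj)
#   print(pca.explained_variance_ratio_)        # per-component share of variance
#   print(pca.explained_variance_ratio_.sum())  # total variance captured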

View File

@@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@@ -103,12 +104,14 @@ def parse(directory, output_dir):
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
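# keep just the host part of each URL so publishers can later be matched by domain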
out['publisher_url_domain'] = urlparse(publisher_url).netloc
out['domain'] = urlparse(url).netloc
item_id = hash((page.stem, url))
out['id'] = item_id
# old_id = hash((title, page.stem, publisher_url))
# out['old_id'] = old_id
published.append(out)
related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -118,6 +121,7 @@ def parse(directory, output_dir):
another['url'] = relation.get('href')
another['publisher'] = relation.text
another['parent_id'] = item_id
another['publisher_domain'] = urlparse(another['url']).netloc
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
@@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
"""fix database after load. remove duplicates. create publishers."""
DB = connect()
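# steps: delete duplicate stories, rebuild the publishers table keyed by name
# and URL domain, then backfill integer publisher_id columns on stories and
# related_stories and drop the old string columns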
DB.sql("""
DELETE FROM stories
WHERE id IN (
@@ -146,29 +151,77 @@ def normalize():
OR title_ctn > 1
)
""")
DB.sql("""
CREATE OR REPLACE TABLE publishers AS
with cte as (
SELECT
s.publisher as name
,s.publisher_url_domain as url
FROM stories s
GROUP BY
s.publisher
,s.publisher_url_domain
), together AS (
SELECT
COALESCE(cte.name, r.publisher) AS name
,COALESCE(cte.url, r.publisher_domain) as url
FROM cte
FULL OUTER JOIN related_stories r
ON cte.url = r.publisher_domain
)
SELECT
ROW_NUMBER() OVER() as id
,t.name
,t.url
FROM together t
where t.url is not null
GROUP BY
name
,url
""")
DB.sql("""
alter table stories
add column publisher_id bigint
""")
DB.sql("""
update stories
set publisher_id = publishers.id
from publishers
where publishers.url = stories.publisher_url_domain
""")
DB.sql("""
alter table stories alter publisher_id set data type bigint
""")
DB.sql("""
alter table stories drop publisher;
alter table stories drop publisher_url;
alter table stories drop publisher_url_domain;
alter table stories drop domain;
""")
DB.sql("""
alter table related_stories
add column publisher_id bigint
""")
DB.sql("""
update related_stories
set publisher_id = publishers.id
from publishers
where publishers.url = related_stories.publisher_domain
""")
DB.sql("""
alter table related_stories drop publisher;
alter table related_stories drop publisher_domain;
""")

View File

@@ -81,3 +81,4 @@ def distance():
min_index = (np.argmin(distances))
closest = np.unravel_index(min_index, distances.shape)
distances.flatten().shape
DB.close()