add better normalization. add link similarity.

matt 2023-05-07 22:07:26 -07:00
parent 3a6f97b290
commit 4bd9f46edd
7 changed files with 383 additions and 29 deletions


@@ -1,5 +1,29 @@
# Data Mining - CSCI 577
# Project Status Report IV
*2023-04-25*
This project report will take the form of an initial draft of the final report, making use of the template discussed in class and made available on Canvas. Minimally, this draft should include the following:
1. Data preparation
2. Policy for dealing with missing attribute values
3. If your project is one of classification, discuss:
a. Intelligent discretization
b. Identification of useless attributes
c. Policy for violations of the adequacy condition and missing
attribute values
4. If your project is one of clustering:
a. Elimination of noise attributes
b. Proper choice or development of distance measures
5. If your project is one of association rule analysis:
a. What are the "market baskets"?
b. How are thresholds for support and confidence developed?
6. In all cases, you should specify:
a. What computational experiments you have conducted, or plan to
conduct.
# Project Status Report III
*2023-04-18*
@@ -35,6 +59,10 @@ I will use the following suite of python tools to conduct my research:
> This progress should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
The ultimate purpose of the project is to track the progress of political discourse as a function of time and publisher.
Using a dataset of article titles and their publishers, the project classifies article titles with a sentiment-analysis language model.
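As a rough illustration of that classification step (a minimal sketch only; the pretrained model named below is a placeholder assumption, not the classifier used in this project), titles can be scored with an off-the-shelf Hugging Face pipeline:

```python
# Illustrative sketch; the pretrained model here is a placeholder assumption.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

titles = [
    "Senate passes sweeping infrastructure bill",
    "Markets slide as inflation fears deepen",
]

for title, result in zip(titles, classifier(titles)):
    # each result is a dict such as {'label': 'NEGATIVE', 'score': 0.98}
    print(title, result["label"], round(result["score"], 3))
```

The project's own pipeline stores per-title emotion labels in the `story_emotions` table and tracks their share of stories over time.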
\newpage
# Project Status Report II


@@ -7,14 +7,16 @@ import os
import csv
def map(rating:str) -> int:
# ordinal scale from left (0) to right (4); 'allsides' is not a single rating, so it maps to -1
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
@@ -35,13 +37,39 @@ def load() -> None:
def normalize() -> None:
DB = connect()
DB.sql("""
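-- match each publisher to a bias rating by fuzzy (Jaro-Winkler) name similarity,
-- keeping only the single highest-similarity rating per publisher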
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
id
,label
FROM ranked
WHERE ranked.rn = 1
""")
DB.sql("""
with cte as (
select
s.publisher_id
,count(1) as stories
from stories s
group by s.publisher_id
)
select
s.publisher


@@ -1,10 +1,12 @@
import click
from dotenv import load_dotenv
@click.group()
def cli():
...
if __name__ == "__main__":
load_dotenv()
import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)


@@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
def data():
# load data
@@ -126,24 +128,153 @@ def normalize():
""")
DB.close()
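# fit a linear trend to each emotion's share of stories over time to see which labels are rising or falling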
@click.command("emotion:coef-over-time")
def coef_over_time():
"""fit a linear trend to each emotion's share of stories over time"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def bucket_results(buckets = '1 month'):
# share of stories carrying each emotion label, per time bucket
results = DB.sql(f"""
with cte as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
)
select
epoch(cte.date) / 60 / 60 / 24 / 365 as date -- epoch seconds converted to years
,cte.label
,cast(cte.stories as float) / t.stories as stories
from cte
join total t
on t.date = cte.date
""").df()
return results
def get_coef(label):
# slope of a least-squares fit of the label's story share against time
reg = linear_model.LinearRegression()
df = results[results['label'] == label]
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
# y_pred = reg.predict(x_test)
# sns.lineplot(x=x_test.flatten(), y=y_pred)
return reg.coef_
collection = []
results = bucket_results('2 year')
for emotion in emotions['label']:
if emotion == 'neutral':
continue
coef = get_coef(emotion)[0]
collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
print(pd.DataFrame(collection).sort_values('coef'))
plt.show()
@click.command("emotion:analyze")
def analyze():
"""plot and group emotional labels"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def get_coef(emotion):
df = DB.sql("""
with cte as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
--AND e.label in ('neutral', 'annoyance')
AND e.label in ('sadness')
GROUP BY
time_bucket(interval '1 month', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '1 month', s.published_at)
)
select
epoch(cte.date) as date
,cte.label
--,total.stories as total
,cast(cte.stories as float) / e.stories as stories
from cte
join emotions e
--on total.date = cte.date
on e.label = cte.label
""").df()
reg = linear_model.LinearRegression()
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
#y_pred = reg.predict(x_test)
return reg.coef_
df = DB.sql(f"""{yearly}""").df()
df['date'] = pd.to_datetime(df['date'])
ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
plt.locator_params(axis='y', nbins=6)
ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
plt.show()
DB.sql("""
WITH grouped as (
), total AS (
SELECT
e.label

src/links.py (new file, 111 lines)

@@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
DB = connect()
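# ordinal encoding of bias labels; 'allsides' is not a single rating, so it is mapped to -1 and filtered out below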
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
,b.label
,b.value
from publishers p
left join bias b
on b.id = p.id
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
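# project the adjacency matrix to a low-dimensional space with PCA and TruncatedSVD for comparison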
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
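# cluster publishers in the PCA space and color the projection by cluster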
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
# color the 2-d projection by each publisher's bias label (using the merged `out` frame)
sns.scatterplot(x=x, y=y, hue=out['label'])
plt.show()


@@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@@ -103,12 +104,14 @@ def parse(directory, output_dir):
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
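# keep just the domain so stories can later be joined to publishers by url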
out['publisher_url_domain'] = urlparse(publisher_url).netloc
out['domain'] = urlparse(url).netloc
item_id = hash((page.stem, url))
out['id'] = item_id
# old_id = hash((title, page.stem, publisher_url))
# out['old_id'] = old_id
published.append(out)
related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -118,6 +121,7 @@ def parse(directory, output_dir):
another['url'] = relation.get('href')
another['publisher'] = relation.text
another['parent_id'] = item_id
another['publisher_domain'] = urlparse(another['url']).netloc
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
@@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
"""fix database after load. remove duplicates. create publishers."""
DB = connect()
DB.sql("""
DELETE FROM stories
WHERE id IN (
@@ -146,29 +151,77 @@ def normalize():
OR title_ctn > 1
)
""")
DB.sql("""
CREATE OR REPLACE TABLE publishers AS
with cte as (
SELECT
s.publisher as name
,s.publisher_url_domain as url
FROM stories s
GROUP BY
s.publisher
,s.publisher_url_domain
), together AS (
SELECT
COALESCE(cte.name, r.publisher) AS name
,COALESCE(cte.url, r.publisher_domain) as url
FROM cte
FULL OUTER JOIN related_stories r
ON cte.url = r.publisher_domain
)
SELECT
ROW_NUMBER() OVER() as id
,t.name
,t.url
FROM together t
where t.url is not null
GROUP BY
name
,url
""")
DB.sql("""
alter table stories
add column publisher_id bigint
""")
DB.sql("""
update stories
set publisher_id = publishers.id
from publishers
where publishers.url = stories.publisher_url_domain
""")
DB.sql("""
alter table stories alter publisher_id set data type bigint
""")
DB.sql("""
alter table stories drop publisher;
alter table stories drop publisher_url;
alter table stories drop publisher_url_domain;
alter table stories drop domain;
""")
DB.sql("""
alter table related_stories
add column publisher_id bigint
""")
DB.sql("""
update related_stories
set publisher_id = publishers.id
from publishers
where publishers.url = related_stories.publisher_domain
""")
DB.sql("""
alter table related_stories drop publisher;
alter table related_stories drop publisher_domain;
""")


@@ -81,3 +81,4 @@ def distance():
min_index = (np.argmin(distances))
closest = np.unravel_index(min_index, distances.shape)
distances.flatten().shape
DB.close()