From 4bd9f46eddb55342dc174f69320217ef9d4ff6c1 Mon Sep 17 00:00:00 2001
From: matt
Date: Sun, 7 May 2023 22:07:26 -0700
Subject: [PATCH] add better normalization. add link similarity.

---
 docs/progress.md |  28 +++++++++
 src/bias.py      |  40 ++++++++++--
 src/cli.py       |   2 +
 src/emotion.py   | 155 +++++++++++++++++++++++++++++++++++++++++++----
 src/links.py     | 111 +++++++++++++++++++++++++++++++++
 src/scrape.py    |  75 +++++++++++++++++++----
 src/word.py      |   1 +
 7 files changed, 383 insertions(+), 29 deletions(-)
 create mode 100644 src/links.py

diff --git a/docs/progress.md b/docs/progress.md
index 2ffbac9..4e618ba 100644
--- a/docs/progress.md
+++ b/docs/progress.md
@@ -1,5 +1,29 @@
 # Data Mining - CSCI 577
 
+# Project Status Report IV
+
+*2023-04-25*
+
+This project report will take the form of an initial draft of the final report, making use of the template discussed in class and made available on Canvas. Minimally, this draft should include the following:
+
+1. Data preparation
+2. Policy for dealing with missing attribute values
+3. If your project is one of classification, discuss:
+   a. Intelligent discretization
+   b. Identification of useless attributes
+   c. Policy for violations of the adequacy condition and missing
+      attribute values
+4. If your project is one of clustering:
+   a. Elimination of noise attributes
+   b. Proper choice or development of distance measures
+5. If your project is one of association rule analysis:
+   a. What are the "market baskets"?
+   b. How are thresholds for support and confidence developed?
+6. In all cases, you should specify:
+   a. What computational experiments you have conducted, or plan to
+      conduct.
+
+
 # Project Status Report III
 
 *2023-04-18*
@@ -35,6 +59,10 @@ I will use the following suite of python tools to conduct my research:
 > This progress should also provide a definitive description of your purpose and how you intend to conduct it.
 > This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
+The ultimate purpose of the project is to track the progress of political discourse as a function of time and publisher.
+Using a dataset of article titles and publications, the aim is to classify article titles with a sentiment-analysis language model.
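+
+As an illustration of this classification step, the sketch below scores article titles with an off-the-shelf emotion classifier. It is a minimal sketch only: the Hugging Face `transformers` pipeline API and the `SamLowe/roberta-base-go_emotions` checkpoint are assumed here purely for illustration, and the project's own wrapper (`BertForMultiLabelClassification` in `src/model.py`) may load a different checkpoint.
+
+```python
+from transformers import pipeline
+
+def classify_titles(titles):
+    """Attach the top emotion label and its score to each article title."""
+    # Assumed checkpoint, chosen for illustration; not taken from this repository.
+    classifier = pipeline(
+        "text-classification",
+        model="SamLowe/roberta-base-go_emotions",
+    )
+    # By default the pipeline returns one {'label': ..., 'score': ...} per input title.
+    preds = classifier(list(titles))
+    return [
+        {"title": t, "label": p["label"], "score": p["score"]}
+        for t, p in zip(titles, preds)
+    ]
+```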
+ + \newpage # Project Status Report II diff --git a/src/bias.py b/src/bias.py index b972d42..5347aa7 100644 --- a/src/bias.py +++ b/src/bias.py @@ -7,14 +7,16 @@ import os import csv def map(rating:str) -> int: + mapping = { - 'right' : 0, + 'left' : 0, 'left-center' : 1, 'center' : 2, - 'left' : 3, - 'allsides' : 4, - 'right-center' : 5 + 'right-center' : 3, + 'right' : 4, + 'allsides' : -1, } + return mapping[rating] @@ -35,13 +37,39 @@ def load() -> None: def normalize() -> None: DB = connect() + DB.sql(""" + CREATE OR REPLACE TABLE publisher_bias AS + WITH cte AS ( + SELECT + p.id + ,b.bias as label + ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity + FROM bias_ratings b + JOIN publishers p + ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95 + ),ranked AS ( + SELECT + id + ,label + ,similarity + ,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn + FROM cte + ) + SELECT + id + ,label + FROM ranked + WHERE ranked.rn = 1 + """) + + DB.sql(""" with cte as ( select - s.publisher + s.publisher_id ,count(1) as stories from stories s - group by s.publisher + group by s.publisher_id ) select s.publisher diff --git a/src/cli.py b/src/cli.py index 64b93f8..a79699c 100644 --- a/src/cli.py +++ b/src/cli.py @@ -1,10 +1,12 @@ import click +from dotenv import load_dotenv @click.group() def cli(): ... if __name__ == "__main__": + load_dotenv() import scrape cli.add_command(scrape.download) cli.add_command(scrape.parse) diff --git a/src/emotion.py b/src/emotion.py index 363c975..70a7757 100644 --- a/src/emotion.py +++ b/src/emotion.py @@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification from data import connect import seaborn as sns import matplotlib.pyplot as plt +from matplotlib.dates import DateFormatter +import matplotlib.dates as mdates def data(): # load data @@ -126,24 +128,153 @@ def normalize(): """) DB.close() +@click.command("emotion:analyze") +def coef_over_time(): + """plot and group emotional labels""" + DB = connect() + + emotions = DB.sql(""" + select label from emotions + """).df() + + from sklearn import linear_model + from sklearn.model_selection import train_test_split + + def results(buckets = '1 month'): + results = DB.sql(f""" + with cte as ( + SELECT + time_bucket(interval '{buckets}', s.published_at) as date + ,e.label + ,COUNT(1) AS stories + FROM stories s + JOIN story_emotions se + ON s.id = se.story_id + JOIN emotions e + ON e.id = se.emotion_id + WHERE YEAR(s.published_at) < 2022 + GROUP BY + time_bucket(interval '{buckets}', s.published_at) + ,e.label + ) + ,total as ( + SELECT + time_bucket(interval '{buckets}', s.published_at) as date + ,COUNT(1) AS stories + FROM stories s + WHERE YEAR(s.published_at) < 2022 + GROUP BY + time_bucket(interval '{buckets}', s.published_at) + ) + select + epoch(cte.date) / 60 / 60 / 24 / 365 as date + ,cte.label + ,cast(cte.stories as float) / t.stories as stories + from cte + join total t + on t.date = cte.date + """).df() + return results + + + def get_coef(label): + reg = linear_model.LinearRegression() + df = results[results['label'] == label] + x = df['date'].to_numpy().reshape(-1, 1) + y = df['stories'] + x_train, x_test = train_test_split(x) + y_train, y_test = train_test_split(y) + reg.fit(x_train, y_train) + # y_pred = reg.predict(x_test) + # sns.lineplot(x=x_test.flatten(), y=y_pred) + return reg.coef_ + + collection = [] + results = results('2 year') + for emotion in emotions['label']: + if emotion == 'neutral': + continue + coef = get_coef(emotion)[0] + if 
coef > 0: + increasing = True + else: + increasing = False + collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 }) + pd.DataFrame(collection).sort_values('coef') + + plt.show() + @click.command("emotion:analyze") def analyze(): """plot and group emotional labels""" DB = connect() + + emotions = DB.sql(""" + select label from emotions + """).df() + + from sklearn import linear_model + from sklearn.model_selection import train_test_split + def get_coef(emotion): + df = DB.sql(""" + with cte as ( + SELECT + time_bucket(interval '1 month', s.published_at) as date + ,e.label + ,COUNT(1) AS stories + FROM stories s + JOIN story_emotions se + ON s.id = se.story_id + JOIN emotions e + ON e.id = se.emotion_id + WHERE YEAR(s.published_at) < 2022 + --AND e.label in ('neutral', 'annoyance') + AND e.label in ('sadness') + GROUP BY + time_bucket(interval '1 month', s.published_at) + ,e.label + ) + ,total as ( + SELECT + time_bucket(interval '1 month', s.published_at) as date + ,COUNT(1) AS stories + FROM stories s + WHERE YEAR(s.published_at) < 2022 + GROUP BY + time_bucket(interval '1 month', s.published_at) + ) + select + epoch(cte.date) as date + ,cte.label + --,total.stories as total + ,cast(cte.stories as float) / e.stories as stories + from cte + join emotions e + --on total.date = cte.date + on e.label = cte.label + """).df() + + reg = linear_model.LinearRegression() + x = df['date'].to_numpy().reshape(-1, 1) + y = df['stories'] + + x_train, x_test = train_test_split(x) + y_train, y_test = train_test_split(y) + reg.fit(x_train, y_train) + #y_pred = reg.predict(x_test) + return reg.coef_ + + + df = DB.sql(f"""{yearly}""").df() + df['date'] = pd.to_datetime(df['date']) + ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label']) + #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2)) + plt.locator_params(axis='y', nbins=6) + ax.xaxis.set_major_formatter(DateFormatter("%m-%y")) + plt.show() + DB.sql(""" WITH grouped as ( - SELECT - YEAR(s.published_at) as year - ,e.label - ,COUNT(1) AS stories - FROM story_emotions e - JOIN stories s - ON s.id = e.story_id - WHERE YEAR(s.published_at) < 2022 - AND label = 'annoyance' - GROUP BY - YEAR(s.published_at) - ,e.label ), total AS ( SELECT e.label diff --git a/src/links.py b/src/links.py new file mode 100644 index 0000000..93e7cfa --- /dev/null +++ b/src/links.py @@ -0,0 +1,111 @@ +from data import connect +import pandas as pd +import numpy as np +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.cluster import MiniBatchKMeans +import seaborn as sns +import matplotlib.pyplot as plt + + + +def to_matrix(): + """returns an adjacency matrix of publishers to publisher link frequency""" + + DB = connect() + + bias_map = pd.DataFrame([ + {'label' :'left', 'value' : 0}, + {'label' :'left-center', 'value' : 1}, + {'label' :'center', 'value' : 2}, + {'label' :'right-center', 'value' : 3}, + {'label' :'right', 'value' : 4}, + {'label' :'allsides', 'value' : -1}, + ]) + bias = DB.sql(""" + SELECT + b.id + ,b.label + ,m.value + FROM publisher_bias b + JOIN bias_map m + ON b.label = m.label + WHERE value != -1 + """).df() + + pub = DB.sql(""" + select + p.id + ,p.name + ,p.url + ,b.label + ,b.value + from publishers p + left join bias b + on b.id = p.id + """).df() + + edges = DB.sql(""" + WITH total as ( + SELECT + s.publisher_id as id + ,COUNT(1) as stories + FROM stories s + GROUP BY + s.publisher_id + ), p as ( + SELECT + p.id + ,stories + FROM publishers p + LEFT JOIN total t + ON t.id = p.id + WHERE 
t.stories >= 20 + ), cte as ( + SELECT + r.publisher_id as child_id + ,s.publisher_id as parent_id + ,count(1) as links + FROM related_stories r + JOIN stories s + ON s.id = r.parent_id + group by + s.publisher_id + ,r.publisher_id + ) + SELECT + p.id as parent_id + ,cte.child_id + ,links + FROM p + left JOIN cte + ON p.id = cte.parent_id + """).df() + + adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0) + + + out = pd.DataFrame(adj.index.values, columns=['id']) + out = pd.merge(out, pub, how='left', on='id') + + pca = PCA(n_components=4) + pca_out = pca.fit_transform(adj) + + svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42) + svd_out = svd.fit_transform(adj) + + x = svd_out[:, 0] + y = svd_out[:, 1] + + x = pca_out[:, 0] + y = pca_out[:, 1] + sns.scatterplot(x=x, y=y) + plt.show() + + kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto") + pred = kmeans.fit_predict(pca_out) + + sns.scatterplot(x=x, y=y, hue=pred) + plt.show() + + sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias']) + plt.show() diff --git a/src/scrape.py b/src/scrape.py index 66e96d0..f3a285c 100644 --- a/src/scrape.py +++ b/src/scrape.py @@ -7,6 +7,7 @@ from tqdm import tqdm from data import data_dir, connect from lxml import etree import pandas as pd +from urllib.parse import urlparse @click.command(name='scrape:load') @click.option('--directory', type=Path, default=data_dir(), show_default=True) @@ -103,12 +104,14 @@ def parse(directory, output_dir): url = item.xpath('.//strong/a')[0].get('href') out['url'] = url + out['publisher_url_domain'] = urlparse(publisher_url).netloc + out['domain'] = urlparse(url).netloc item_id = hash((page.stem, url)) out['id'] = item_id - old_id = hash((title, page.stem, publisher_url)) - out['old_id'] = old_id + # old_id = hash((title, page.stem, publisher_url)) + # out['old_id'] = old_id published.append(out) related = item.xpath(".//span[contains(@class, 'mls')]/a") @@ -118,6 +121,7 @@ def parse(directory, output_dir): another['url'] = relation.get('href') another['publisher'] = relation.text another['parent_id'] = item_id + another['publisher_domain'] = urlparse(another['url']).netloc others.append(another) df = pd.DataFrame(published) df.to_csv(output_dir / 'stories.csv', sep='|', index=False) @@ -128,6 +132,7 @@ def parse(directory, output_dir): def normalize(): """fix database after load. remove duplicates. 
create publishers.""" DB = connect() + DB.sql(""" DELETE FROM stories WHERE id IN ( @@ -146,29 +151,77 @@ def normalize(): OR title_ctn > 1 ) """) + + + DB.sql(""" CREATE OR REPLACE TABLE publishers AS with cte as ( SELECT - s.publisher - ,s.publisher_url + s.publisher as name + ,s.publisher_url_domain as url FROM stories s GROUP BY s.publisher - ,s.publisher_url + ,s.publisher_url_domain ), together AS ( SELECT - COALESCE(cte.publisher, r.publisher) AS publisher - ,cte.publisher_url + COALESCE(cte.name, r.publisher) AS name + ,COALESCE(cte.url, r.publisher_domain) as url FROM cte FULL OUTER JOIN related_stories r - ON cte.publisher = r.publisher + ON cte.url = r.publisher_domain ) SELECT ROW_NUMBER() OVER() as id - ,t.* + ,t.name + ,t.url FROM together t + where t.url is not null GROUP BY - publisher - ,publisher_url + name + ,url """) + + DB.sql(""" + alter table stories + add column publisher_id bigint + """) + + DB.sql(""" + update stories + set publisher_id = publishers.id + from publishers + where publishers.url = stories.publisher_url_domain + """) + + DB.sql(""" + alter table stories alter publisher_id set data type bigint + """) + + + DB.sql(""" + alter table stories drop publisher; + alter table stories drop publisher_url; + alter table stories drop publisher_url_domain; + alter table stories drop domain; + """) + + DB.sql(""" + alter table related_stories + add column publisher_id bigint + """) + + + DB.sql(""" + update related_stories + set publisher_id = publishers.id + from publishers + where publishers.url = related_stories.publisher_domain + """) + + DB.sql(""" + alter table related_stories drop publisher; + alter table related_stories drop publisher_domain; + """) + diff --git a/src/word.py b/src/word.py index 980787b..2d8d1f3 100644 --- a/src/word.py +++ b/src/word.py @@ -81,3 +81,4 @@ def distance(): min_index = (np.argmin(distances)) closest = np.unravel_index(min_index, distances.shape) distances.flatten().shape + DB.close()
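
The new `src/links.py` builds a publisher-to-publisher matrix of link counts and looks for structure in it by reducing the rows with PCA/TruncatedSVD and clustering them with MiniBatchKMeans. The sketch below illustrates that flow on a made-up 4x4 adjacency matrix; the matrix values and cluster count are hypothetical, while the real module derives the matrix from the `stories` and `related_stories` tables.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

# Hypothetical adjacency matrix: rows are linking publishers, columns are
# linked-to publishers, values are link counts (made up for illustration).
adjacency = np.array([
    [0, 12, 3, 0],
    [10, 0, 1, 0],
    [2,  1, 0, 9],
    [0,  0, 8, 0],
], dtype=float)

# Project each publisher's outgoing-link profile onto two components,
# then group publishers that link to similar outlets.
components = PCA(n_components=2).fit_transform(adjacency)
labels = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit_predict(components)
print(labels)  # cluster assignment per publisher row
```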