diff --git a/docs/figures/bias_hist.png b/docs/figures/bias_hist.png
index 056d985..33e7a67 100644
Binary files a/docs/figures/bias_hist.png and b/docs/figures/bias_hist.png differ
diff --git a/docs/figures/bias_over_time.png b/docs/figures/bias_over_time.png
new file mode 100644
index 0000000..c52b944
Binary files /dev/null and b/docs/figures/bias_over_time.png differ
diff --git a/docs/figures/bias_publisher_hist.png b/docs/figures/bias_publisher_hist.png
index b0dae0a..e351743 100644
Binary files a/docs/figures/bias_publisher_hist.png and b/docs/figures/bias_publisher_hist.png differ
diff --git a/docs/figures/bias_vs_recent_winner.png b/docs/figures/bias_vs_recent_winner.png
index 7da9404..7db5220 100644
Binary files a/docs/figures/bias_vs_recent_winner.png and b/docs/figures/bias_vs_recent_winner.png differ
diff --git a/docs/figures/bias_vs_sentiment_over_time.png b/docs/figures/bias_vs_sentiment_over_time.png
index 18d0b00..2888901 100644
Binary files a/docs/figures/bias_vs_sentiment_over_time.png and b/docs/figures/bias_vs_sentiment_over_time.png differ
diff --git a/docs/figures/emotion_hist.png b/docs/figures/emotion_hist.png
new file mode 100644
index 0000000..49cdaed
Binary files /dev/null and b/docs/figures/emotion_hist.png differ
diff --git a/docs/figures/emotion_over_time.png b/docs/figures/emotion_over_time.png
index 190d285..c49a7fd 100644
Binary files a/docs/figures/emotion_over_time.png and b/docs/figures/emotion_over_time.png differ
diff --git a/docs/figures/emotion_regression.png b/docs/figures/emotion_regression.png
index b3ff0d0..3cbee88 100644
Binary files a/docs/figures/emotion_regression.png and b/docs/figures/emotion_regression.png differ
diff --git a/docs/figures/link_links_pca_with_classes.png b/docs/figures/link_links_pca_with_classes.png
new file mode 100644
index 0000000..66bfe3c
Binary files /dev/null and b/docs/figures/link_links_pca_with_classes.png differ
diff --git a/docs/figures/link_normalized_pca_with_classes.png b/docs/figures/link_normalized_pca_with_classes.png
new file mode 100644
index 0000000..980b2b5
Binary files /dev/null and b/docs/figures/link_normalized_pca_with_classes.png differ
diff --git a/docs/figures/link_onehot_pca_with_classes.png b/docs/figures/link_onehot_pca_with_classes.png
new file mode 100644
index 0000000..1c4e179
Binary files /dev/null and b/docs/figures/link_onehot_pca_with_classes.png differ
diff --git a/docs/figures/link_{source}_pca_with_classes.png b/docs/figures/link_{source}_pca_with_classes.png
new file mode 100644
index 0000000..66bfe3c
Binary files /dev/null and b/docs/figures/link_{source}_pca_with_classes.png differ
diff --git a/docs/figures/pca_with_classes.png b/docs/figures/pca_with_classes.png
index a0362ef..1c4e179 100644
Binary files a/docs/figures/pca_with_classes.png and b/docs/figures/pca_with_classes.png differ
diff --git a/docs/figures/sentiment_hist.png b/docs/figures/sentiment_hist.png
new file mode 100644
index 0000000..0b2365a
Binary files /dev/null and b/docs/figures/sentiment_hist.png differ
diff --git a/docs/figures/sentiment_over_time.png b/docs/figures/sentiment_over_time.png
index c5be0f8..815b882 100644
Binary files a/docs/figures/sentiment_over_time.png and b/docs/figures/sentiment_over_time.png differ
diff --git a/docs/figures/stories_per_publisher.png b/docs/figures/stories_per_publisher.png
index f63d983..5016512 100644
Binary files a/docs/figures/stories_per_publisher.png and b/docs/figures/stories_per_publisher.png differ
diff --git a/docs/figures/top_publishers.png b/docs/figures/top_publishers.png
index 8961cb7..4abbeb6 100644
Binary files a/docs/figures/top_publishers.png and b/docs/figures/top_publishers.png differ
diff --git a/src/apriori.py b/src/apriori.py
new file mode 100644
index 0000000..601ae6e
--- /dev/null
+++ b/src/apriori.py
@@ -0,0 +1,28 @@
+import click
+from efficient_apriori import apriori
+from data.main import connect
+
+@click.command("apriori:rules")
+def rules():
+    DB = connect()
+    data = DB.query("""
+        SELECT
+            --list_prepend(parent.id, list(child.id)) as transaction
+            list_prepend(parent.tld, list(child.tld)) as transaction
+        FROM stories s
+        JOIN related_stories r
+            ON r.parent_id = s.id
+        JOIN publishers parent
+            ON parent.id = s.publisher_id
+        JOIN publishers child
+            ON child.id = r.publisher_id
+        GROUP BY
+            --parent.id
+            parent.tld
+    """).df()
+    DB.close()
+
+    transactions = data.transaction.apply(lambda x: tuple(x)).values
+
+    itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
+    print(*rules, sep="\n")
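A quick sanity check of the `efficient_apriori` call pattern used above, on hand-rolled transactions (toy TLDs, not query output):

```python
from efficient_apriori import apriori

# each transaction: a parent TLD plus the TLDs its story linked out to
transactions = [
    ("cnn.com", "nytimes.com", "foxnews.com"),
    ("cnn.com", "nytimes.com"),
    ("foxnews.com", "nytimes.com"),
]

itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=0.8)
print(*rules, sep="\n")  # e.g. {cnn.com} -> {nytimes.com}
```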
diff --git a/src/bias.py b/src/bias.py
index 92f952a..f023fc9 100644
--- a/src/bias.py
+++ b/src/bias.py
@@ -1,67 +1,42 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import pandas as pd
 from lxml import etree
 from pathlib import Path
 import os
 import csv
 
-def label_to_int(rating:str) -> int:
-
-    mapping = {
-        'left' : 0,
-        'left-center' : 1,
-        'center' : 2,
-        'right-center' : 3,
-        'right' : 4,
-        'allsides' : -1,
-    }
-
-    return mapping[rating]
-
-def int_to_label(class_id: int) -> str:
-    mapping = {
-        0 : 'left',
-        1 : 'left-center',
-        2 : 'center',
-        3 : 'right-center',
-        4 : 'right',
-        -1 : 'allsides',
-    }
-    return mapping[class_id]
-
 @click.command(name="bias:normalize")
 def normalize() -> None:
-    DB = connect()
-
-    DB.sql("""
-        CREATE OR REPLACE TABLE publisher_bias AS
-        WITH cte AS (
-            SELECT
-                p.id as publisher_id
-                ,b.id as bias_id
-                ,b.bias as label
-                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
-            FROM bias_ratings b
-            JOIN top.publishers p
-                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
-        ),ranked AS (
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE publisher_bias AS
+            WITH cte AS (
+                SELECT
+                    p.id as publisher_id
+                    ,b.id as bias_id
+                    ,b.bias as label
+                    ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
+                FROM bias_ratings b
+                JOIN top.publishers p
+                    ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
+            ),ranked AS (
+                SELECT
+                    publisher_id
+                    ,bias_id
+                    ,label
+                    ,similarity
+                    ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
+                FROM cte
+            )
             SELECT
                 publisher_id
-                ,bias_id
                 ,label
-                ,similarity
-                ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
-            FROM cte
-        )
-        SELECT
-            publisher_id
-            ,label
-            ,bias_id
-        FROM ranked
-        WHERE ranked.rn = 1
-    """)
+                ,bias_id
+            FROM ranked
+            WHERE ranked.rn = 1
+        """)
 
     mapping = [
         {'label' :'left' , 'ordinal': -2},
@@ -72,22 +47,20 @@ def normalize() -> None:
     ]
     mapping = pd.DataFrame(mapping)
 
-    DB.query("alter table bias_ratings add column ordinal int")
-
-    DB.query("""
-        update bias_ratings b
-        set ordinal = o.ordinal
-        FROM mapping o
-        WHERE o.label = b.bias
-    """)
+    with connect() as db:
+        db.query("alter table bias_ratings add column ordinal int")
+        db.query("""
+            update bias_ratings b
+            set ordinal = o.ordinal
+            FROM mapping o
+            WHERE o.label = b.bias
+        """)
 
 @click.command(name='bias:parse')
 def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    bias_html = DATA_DIR / 'allsides.html'
+    bias_html = paths('data') / 'allsides.html'
 
     parser = etree.HTMLParser()
     tree = etree.parse(str(bias_html), parser)
@@ -111,65 +84,63 @@ def parse() -> None:
         rating['disagree'] = int(disagree)
         ratings.append(rating)
     df = pd.DataFrame(ratings)
-    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
 
 @click.command(name="bias:load")
 def load() -> None:
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    f = str(DATA_DIR / "bias_ratings.csv")
+    f = str(paths('data') / "bias_ratings.csv")
 
-    DB.sql(f"""
-        CREATE TABLE bias_ratings as
-        select
-            row_number() over(order by b.publisher) as id
-            ,b.*
-        from read_csv_auto('{f}') b
-    """)
+    with connect() as db:
+        db.sql(f"""
+            CREATE TABLE bias_ratings as
+            select
+                row_number() over(order by b.publisher) as id
+                ,b.*
+            from read_csv_auto('{f}') b
+        """)
 
 @click.command('bias:export')
 def export():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
+    with connect() as db:
+        all_bias = db.query("""
+            SELECT
+                id as bias_id
+                ,publisher as name
+                ,bias as label
+            FROM bias_ratings
+            ORDER by agree desc
+        """)
 
-    DB = connect()
-    all_bias = DB.query("""
-        SELECT
-            id as bias_id
-            ,publisher as name
-            ,bias as label
-        FROM bias_ratings
-        ORDER by agree desc
-    """)
-    all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
-    mapped_bias = DB.query("""
-        SELECT
-            p.id as publisher_id
-            ,p.name as name
-            ,p.tld as tld
-            ,b.label as bias
-            ,b.bias_id as bias_id
-        FROM top.publishers p
-        LEFT JOIN publisher_bias b
-            ON b.publisher_id = p.id
-    """)
-    mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
-    DB.close()
+    all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
+    with connect() as db:
+        mapped_bias = db.query("""
+            SELECT
+                p.id as publisher_id
+                ,p.name as name
+                ,p.tld as tld
+                ,b.label as bias
+                ,b.bias_id as bias_id
+            FROM top.publishers p
+            LEFT JOIN publisher_bias b
+                ON b.publisher_id = p.id
+        """)
+    mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
 
 @click.command('bias:import-mapped')
 def import_mapped():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
     table_name = "top.publisher_bias"
-    DB = connect()
-    df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
+    df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
+
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                publisher_id AS publisher_id
+                ,cast(bias_id AS int) as bias_id
+            FROM df
+            WHERE bias_id IS NOT NULL
+        """)
 
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            publisher_id AS publisher_id
-            ,cast(bias_id AS int) as bias_id
-        FROM df
-        WHERE bias_id IS NOT NULL
-    """)
     print(f"created table: {table_name}")
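`bias:normalize` leans on DuckDB's built-in Jaro-Winkler similarity to fuzzy-match publisher names against rating rows; a self-contained sketch of that scoring step (toy names, in-memory database):

```python
import duckdb

db = duckdb.connect()  # in-memory
sim = db.query("""
    SELECT JARO_WINKLER_SIMILARITY(LOWER('Breitbart'),
                                   LOWER('Breitbart News')) AS similarity
""").fetchone()[0]
# scores approach 1.0 as names converge; normalize() only keeps
# pairs above the fairly strict 0.95 join threshold
print(sim)
```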
diff --git a/src/cli.py b/src/cli.py
index a85457f..b813232 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -1,5 +1,7 @@
 import click
 from dotenv import load_dotenv
+import data
+import plots
 
 @click.group()
 def cli():
@@ -7,12 +9,20 @@ if __name__ == "__main__":
     load_dotenv()
 
-    from data import scrape
-    cli.add_command(scrape.download)
-    cli.add_command(scrape.parse)
-    cli.add_command(scrape.load)
-    cli.add_command(scrape.normalize)
-    cli.add_command(scrape.create_elections_table)
+
+    # original bias ratings
+    cli.add_command(data.scrape.download)
+    cli.add_command(data.scrape.parse)
+    cli.add_command(data.scrape.load)
+    cli.add_command(data.scrape.normalize)
+    cli.add_command(data.scrape.create_elections_table)
+
+    cli.add_command(data.factcheck.parse_index)
+    cli.add_command(data.factcheck.scrape)
+
+    cli.add_command(data.links.create_table)
+    cli.add_command(data.links.create_pca)
+    cli.add_command(data.links.create_clusters)
 
     import word
     # cli.add_command(word.distance)
@@ -23,10 +33,12 @@ if __name__ == "__main__":
     cli.add_command(bias.parse)
     cli.add_command(bias.load)
     cli.add_command(bias.normalize)
+
     import mine
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)
+
     import emotion
     cli.add_command(emotion.extract)
     cli.add_command(emotion.normalize)
@@ -40,34 +52,20 @@ if __name__ == "__main__":
     from train import main as train_main
     cli.add_command(train_main.main)
 
-    import plots.descriptive as plotd
-    cli.add_command(plotd.articles_per_year)
-    cli.add_command(plotd.distinct_publishers)
-    cli.add_command(plotd.stories_per_publisher)
-    cli.add_command(plotd.top_publishers)
-    cli.add_command(plotd.common_tld)
-
-    import links as linkcli
-    cli.add_command(linkcli.create_table)
-    cli.add_command(linkcli.create_pca)
-    cli.add_command(linkcli.create_clusters)
-
-    import plots.links as plotl
-    cli.add_command(plotl.elbow)
-    cli.add_command(plotl.link_pca_clusters)
-
-    import plots.classifier as plotc
-    cli.add_command(plotc.pca_with_classes)
-
-    import plots
+    cli.add_command(plots.descriptive.articles_per_year)
+    cli.add_command(plots.descriptive.distinct_publishers)
+    cli.add_command(plots.descriptive.stories_per_publisher)
+    cli.add_command(plots.descriptive.top_publishers)
+    cli.add_command(plots.descriptive.common_tld)
     cli.add_command(plots.sentence.sentence_pca)
     cli.add_command(plots.sentence.avg_sentence_pca)
     cli.add_command(plots.emotion.emotion_over_time)
     cli.add_command(plots.emotion.emotion_regression)
-
     cli.add_command(plots.sentiment.over_time)
     cli.add_command(plots.sentiment.bias_over_time)
     cli.add_command(plots.sentiment.bias_vs_recent_winner)
-
+    cli.add_command(plots.links.elbow)
+    cli.add_command(plots.links.link_pca_clusters)
+    cli.add_command(plots.classifier.pca_with_classes)
     cli()
diff --git a/src/data/__init__.py b/src/data/__init__.py
index 0c64a7f..ec59aa8 100644
--- a/src/data/__init__.py
+++ b/src/data/__init__.py
@@ -1,6 +1,10 @@
 import data.main
 import data.scrape
+import data.factcheck
+import data.links
 
 __all__ = [
     'main'
     ,'scrape'
+    ,'factcheck'
+    ,'links'
 ]
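For reference, the `cli.add_command` pattern used throughout `src/cli.py`, reduced to a self-contained toy (not repo code) that can be exercised without a shell via click's test runner:

```python
import click
from click.testing import CliRunner

@click.group()
def cli():
    pass

@click.command("hello:world")
def hello():
    click.echo("hello")

cli.add_command(hello)  # registers under the name "hello:world"

result = CliRunner().invoke(cli, ["hello:world"])
assert result.output == "hello\n"
```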
diff --git a/src/data/factcheck.py b/src/data/factcheck.py
new file mode 100644
index 0000000..6538010
--- /dev/null
+++ b/src/data/factcheck.py
@@ -0,0 +1,171 @@
+import requests
+from lxml import etree
+from bs4 import BeautifulSoup
+import re
+from io import BytesIO
+import pandas as pd
+from pathlib import Path
+import os
+import sys
+import click
+from data.main import connect, map_tld, paths, bias_label_to_int
+from random import randint
+from time import sleep
+from tqdm import tqdm
+
+
+@click.command('mbfc:parse-index')
+def parse_index():
+    parser = etree.HTMLParser()
+    publishers = []
+    for page in range(1, 54):
+        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
+        print(f"downloading {url}", file=sys.stderr)
+        response = requests.get(url)
+        html = response.content
+        tree = etree.parse(BytesIO(html), parser)
+        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
+        print(f"parsing {len(rows)} rows", file=sys.stderr)
+        for row in rows:
+            publisher = {}
+            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
+            link = link.xpath('./a')[0]
+            publisher['name'] = link.text
+            publisher['detail_url'] = link.get('href')
+            publisher['bias'] = bias.text
+            publisher['reporting'] = reporting.text
+            publisher['country'] = country.text
+            publisher['credibility'] = credibility.text
+            publisher['media_type'] = media_type.text
+            publisher['traffic'] = traffic.text
+            publisher['popularity'] = popularity.xpath('./span')[0].text
+            publishers.append(publisher)
+    df = pd.DataFrame(publishers)
+    save_to = paths('data') / 'mbfc_bias.csv'
+    df.to_csv(save_to, sep='|', index=False)
+    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
+
+@click.command("mbfc:schema")
+def schema():
+    with connect() as db:
+        db.sql("""create schema mbfc""")
+        db.sql("""create or replace table mbfc.scrape (
+            url text
+            ,scraped_at datetime default now()
+        )
+        """)
+
+@click.command("mbfc:scrape")
+def scrape():
+
+    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
+
+    with connect() as db:
+        stats = db.query("""
+            select
+                count(1) filter(where s.url is not null) as elapsed
+                ,count(1) filter(where s.url is null) as remaining
+            from df
+            left join mbfc.scrape s
+                on df.detail_url = s.url
+        """).fetchall()
+        df = db.query("""
+            select
+                detail_url as url
+            from df
+            where df.detail_url not in (
+                select
+                    url
+                from mbfc.scrape
+            )
+        """).df()
+    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
+
+    for url in df.url:
+        delay = randint(1,3)
+        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
+        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
+        sleep(delay)
+        try:
+            response = requests.get(url)
+        except Exception as e:
+            print(f"request failed: {url}", file=sys.stderr)
+            continue
+        with open(save_as, 'w') as f:
+            f.write(response.text)
+        with connect() as db:
+            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
+        print(f"saved: {save_as}", file=sys.stderr)
+
+def load():
+
+    publishers = []
+    for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
+        publisher = {}
+        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
+        with page.open() as p:
+            tree = BeautifulSoup(p, 'html.parser')
+        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
+            e = e.parent
+            while e.name != 'p':
+                e = e.parent
+            l = e.find('a')
+            if l:
+                publisher['tld'] = l.get('href')
+                break
+        else:
+            breakpoint()
+        publishers.append(publisher)
+    df = pd.DataFrame(publishers)
+    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
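A note on the `for ... else` inside `load()` above: the `else` arm runs only when the loop completes without `break`, so the `breakpoint()` fires exactly when no "Source:" link was found on a page. A toy illustration:

```python
# the else arm runs only if the loop never hits break
for anchor in ["about", "contact"]:
    if anchor == "source":
        break
else:
    print("no source link found")  # this prints
```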
+@click.command('mbfc:create-tables')
+def create_tables():
+
+    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
+    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
+    df = pubs.merge(urls, on='mbfc_url')
+    df['tld'] = df.tld.apply(map_tld)
+    df['ordinal'] = df.bias.apply(bias_label_to_int)
+
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE mbfc.publishers AS
+            SELECT
+                row_number() over() as id
+                ,p.tld
+                ,mode(p.name) as name
+                ,mode(p.bias) as bias
+                ,mode(p.ordinal) as ordinal
+                ,mode(p.reporting) as reporting
+                ,mode(p.country) as country
+                ,mode(p.credibility) as credibility
+                ,mode(p.media_type) as media_type
+                ,mode(p.traffic) as traffic
+                ,mode(p.popularity) as popularity
+            FROM df p
+            GROUP BY
+                p.tld
+        """)
+
+    with connect() as db:
+        raw_stories = db.sql("""
+            SELECT
+                *
+            FROM stories s
+        """).df()
+
+    raw_stories['tld'] = raw_stories.url.apply(map_tld)
+
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
+            SELECT
+                s.id as story_id
+                ,p.id as publisher_id
+            FROM raw_stories s
+            JOIN mbfc.publishers p
+                ON p.tld = s.tld
+        """)
diff --git a/src/data/links.py b/src/data/links.py
new file mode 100644
index 0000000..94a335c
--- /dev/null
+++ b/src/data/links.py
@@ -0,0 +1,135 @@
+import click
+from data.main import connect
+import pandas as pd
+
+@click.command('links:create-table')
+def create_table():
+
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE link_edges AS
+            with cte as(
+                SELECT
+                    s.publisher_id as parent_id
+                    ,r.publisher_id as child_id
+                    ,count(1) as links
+                FROM stories s
+                JOIN related_stories r
+                    ON s.id = r.parent_id
+                group by
+                    s.publisher_id
+                    ,r.publisher_id
+            )
+            SELECT
+                cte.parent_id
+                ,cte.child_id
+                ,cte.links as links
+                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
+                ,case when cte.links > 0 then 1 else 0 end as onehot
+            FROM cte
+            WHERE cte.child_id in (
+                SELECT
+                    distinct parent_id
+                FROM cte
+            )
+            AND cte.parent_id in (
+                SELECT
+                    distinct child_id
+                FROM cte
+            )
+        """)
+
+        db.query("""
+            SELECT
+                *
+                ,count(1) over()
+            FROM link_edges e
+            limit 1
+        """)
+
+    print(f"created link_edges")
+
+@click.command('links:create-pca')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_pca(source):
+    """create 2D pca labels"""
+    from sklearn.decomposition import PCA
+
+    table_name = f"publisher_pca_{source}"
+
+    with connect() as db:
+        pub = db.query("""
+            SELECT
+                p.*
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON p.id = ps.publisher_id
+        """).df()
+        df = db.query(f"""
+            SELECT
+                parent_id
+                ,child_id
+                ,{source} as links
+            FROM link_edges
+        """).df()
+
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    svd = PCA(n_components=2)
+    svd_out = svd.fit_transform(pivot)
+    out = pivot.reset_index()[['parent_id']]
+    out['first'] = svd_out[:, 0]
+    out['second'] = svd_out[:, 1]
+    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                out.id as publisher_id
+                ,out.first as first
+                ,out.second as second
+            FROM out
+        """)
+
+    print(f"created {table_name}")
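For intuition, the pivot-then-PCA step in `links:create-pca` reduced to a toy edge list (IDs and counts are made up):

```python
import pandas as pd
from sklearn.decomposition import PCA

edges = pd.DataFrame({
    'parent_id': [1, 1, 2, 3],
    'child_id':  [2, 3, 1, 1],
    'links':     [5, 1, 2, 4],
})

# rows: linking publishers, columns: linked publishers, 0 where no edge
pivot = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

# compress each publisher's outbound-link profile to 2 coordinates
components = PCA(n_components=2).fit_transform(pivot)
print(components.shape)  # (3, 2)
```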
+@click.command('links:create-clusters')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_clusters(source):
+    """create link adj. matrix clusters table"""
+    from sklearn.cluster import KMeans
+
+    table_name = f"publisher_clusters_{source}"
+    with connect() as db:
+        df = db.query(f"""
+            SELECT
+                parent_id
+                ,child_id
+                ,{source} as links
+            FROM link_edges
+        """).df()
+        pub = db.query("""
+            SELECT
+                p.*
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+        """).df()
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    k = 8
+    kmeans = KMeans(n_clusters=k, n_init="auto")
+    pred = kmeans.fit_predict(pivot)
+    out = pivot.reset_index()[['parent_id']]
+    out['label'] = pred
+    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+    new_table = out[['id', 'label']]
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                n.id as publisher_id
+                ,n.label as label
+            FROM new_table n
+        """)
+    print(f"created {table_name}")
diff --git a/src/data/main.py b/src/data/main.py
index e59878c..eccf43d 100644
--- a/src/data/main.py
+++ b/src/data/main.py
@@ -2,6 +2,10 @@ import os
 from pathlib import Path
 import duckdb
 from enum import Enum
+from urllib.parse import urlparse
+from tld import get_tld
+from tld.utils import update_tld_names
+import sys
 
 class Data(str, Enum):
     Titles = 'titles'
@@ -9,6 +13,16 @@ class Data(str, Enum):
 def data_dir():
     return Path(os.environ['DATA_MINING_DATA_DIR'])
 
+def paths(name='app'):
+    if 'app' in name:
+        return Path(os.environ['DATA_MINING_APP_DIR'])
+    if 'data' in name:
+        return Path(os.environ['DATA_MINING_DATA_DIR'])
+    if 'doc' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR'])
+    if 'figure' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
+
 def connect():
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
     # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@@ -28,3 +42,66 @@ def from_db(t: Data):
         limit 100
     """).df()
     return table
+
+def map_tld(x):
+    try:
+        res = get_tld(x, as_object=True)
+        return res.fld
+    except:
+        print(f"'{x}' is not valid.", file=sys.stderr)
+        return None
+
+def ticklabels():
+    return [
+        'Left',
+        'Left-Center',
+        'Least Biased',
+        'Right-Center',
+        'Right',
+    ]
+
+def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
+    if source == 'mbfc':
+        mapping = {
+            'Left' : 0,
+            'Left-Center' : 1,
+            'Least Biased' : 2,
+            'Right-Center' : 3,
+            'Right' : 4,
+        }
+    else:
+        mapping = {
+            'left' : 0,
+            'left-center' : 1,
+            'center' : 2,
+            'right-center' : 3,
+            'right' : 4,
+        }
+    try:
+        return mapping[rating]
+    except:
+        print(f"no mapping for {rating}", file=sys.stderr)
+        return -1
+
+def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
+    if source == 'mbfc':
+        mapping = {
+            0 : 'Left',
+            1 : 'Left-Center',
+            2 : 'Least Biased',
+            3 : 'Right-Center',
+            4 : 'Right',
+        }
+    else:
+        mapping = {
+            0 : 'left',
+            1 : 'left-center',
+            2 : 'center',
+            3 : 'right-center',
+            4 : 'right',
+        }
+    try:
+        return mapping[class_id]
+    except:
+        print(f"no mapping for {class_id}", file=sys.stderr)
+        return -1
diff --git a/src/data/scrape.py b/src/data/scrape.py
index a0d07c1..36b2124 100644
--- a/src/data/scrape.py
+++ b/src/data/scrape.py
@@ -319,12 +319,6 @@ def another_norm():
     """)
 
 
-    def map_tld(x):
-        try:
-            res = get_tld(x, as_object=True)
-            return res.fld
-        except:
-            return None
 
     DB.sql("""
         SELECT
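A hedged sketch of how the new `data.main` helpers compose (paths depend on the `DATA_MINING_*` environment variables being set):

```python
from data.main import paths, map_tld, bias_label_to_int, bias_int_to_label

print(paths('figures'))  # $DATA_MINING_DOCS_DIR/figures

# collapse a full URL to its registered domain; invalid input returns None
print(map_tld('https://www.nytimes.com/section/politics'))  # nytimes.com

# MBFC labels round-trip through their ordinal encoding
assert bias_int_to_label(bias_label_to_int('Least Biased')) == 'Least Biased'
```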
diff --git a/src/data/selection.py b/src/data/selection.py
new file mode 100644
index 0000000..8d20747
--- /dev/null
+++ b/src/data/selection.py
@@ -0,0 +1,47 @@
+from data.main import connect
+import pandas as pd
+import numpy as np
+
+def create_tables():
+
+    with connect() as db:
+        edges = db.query("""
+            select
+                *
+            from link_edges
+        """).df()
+
+    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
+
+    with connect() as db:
+        db.query("create schema top")
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.publishers AS
+            SELECT
+                p.*
+            FROM publishers p
+            JOIN select_publishers s
+                ON s.publisher_id = p.id
+        """)
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.stories AS
+            SELECT
+                s.*
+            FROM stories s
+            JOIN top.publishers p
+                ON s.publisher_id = p.id
+            WHERE year(s.published_at) >= 2006
+            AND year(s.published_at) < 2023
+        """)
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.related_stories AS
+            SELECT
+                r.*
+            FROM top.stories s
+            JOIN related_stories r
+                ON s.id = r.parent_id
+        """)
diff --git a/src/sentiment.py b/src/data/sentiment.py
similarity index 78%
rename from src/sentiment.py
rename to src/data/sentiment.py
index 454f9f6..e41c71c 100644
--- a/src/sentiment.py
+++ b/src/data/sentiment.py
@@ -1,10 +1,11 @@
+import click
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
 import torch.nn.functional as F
-from data import connect, data_dir
+from data.main import connect, paths
 import numpy as np
 from tqdm import tqdm
-import click
+import pandas as pd
 
 @click.option('-c', '--chunks', type=int, default=500, show_default=True)
 @click.command("sentiment:extract")
@@ -67,20 +68,19 @@ def extract(chunks):
 
 @click.command('sentiment:load')
 def load():
-    DB = connect()
-    sentiments = np.load(data_dir() / 'sentiment.npy')
-    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
+    sentiments = np.load(paths('data') / 'sentiment.npy')
+    story_ids = np.load(paths('data') / 'sentiment_ids.npy')
     data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
     data['sentiment_id'] = sentiments
 
-    DB.query("""
-        CREATE OR REPLACE TABLE top.story_sentiments AS
-        SELECT
-            data.story_id
-            ,data.sentiment_id as class_id
-            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
-        FROM data
-        JOIN top.stories s
-            ON s.id = data.story_id
-    """)
-    DB.close()
+    with connect() as db:
+        db.query("""
+            CREATE OR REPLACE TABLE story_sentiments AS
+            SELECT
+                data.story_id
+                ,data.sentiment_id as class_id
+                ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
+            FROM data
+            JOIN stories s
+                ON s.id = data.story_id
+        """)
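The `sentiment:extract` body is elided between the hunks above; for orientation, a minimal sketch of a DistilBERT sentiment pass (the SST-2 checkpoint name here is an assumption, not read from the repo):

```python
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

name = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(name)
model = DistilBertForSequenceClassification.from_pretrained(name)

batch = tokenizer(["markets rally on strong jobs report"],
                  return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    probs = F.softmax(model(**batch).logits, dim=-1)
print(probs.argmax(dim=-1).item())  # 1 == positive for this checkpoint
```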
diff --git a/src/links.py b/src/links.py
deleted file mode 100644
index 9dc2a56..0000000
--- a/src/links.py
+++ /dev/null
@@ -1,255 +0,0 @@
-import click
-from data.main import connect
-import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-
-@click.command('links:create-table')
-def create_table():
-
-    table_name = "top.link_edges"
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        with cte as(
-            SELECT
-                s.publisher_id as parent_id
-                ,r.publisher_id as child_id
-                ,count(1) as links
-            FROM top.stories s
-            JOIN top.related_stories r
-                ON s.id = r.parent_id
-            group by
-                s.publisher_id
-                ,r.publisher_id
-        )
-        SELECT
-            cte.parent_id
-            ,cte.child_id
-            ,cte.links as links
-            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
-            ,case when cte.links > 0 then 1 else 0 end as onehot
-        FROM cte
-        WHERE cte.child_id in (
-            SELECT
-                distinct parent_id
-            FROM cte
-        )
-        AND cte.parent_id in (
-            SELECT
-                distinct child_id
-            FROM cte
-        )
-    """)
-    DB.close()
-
-    DB = connect()
-    DB.query("""
-        SELECT
-            *
-            ,-log10(links)
-            --distinct parent_id
-        FROM top.link_edges e
-        WHERE e.parent_id = 238
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-@click.command('links:create-pca')
-@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
-def create_pca(source):
-    """create 2D pca labels"""
-
-    from sklearn.decomposition import PCA
-
-    table_name = f"top.publisher_pca_{source}"
-    DB = connect()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    df = DB.query(f"""
-        SELECT
-            parent_id
-            ,child_id
-            ,{source} as links
-        FROM top.link_edges
-    """).df()
-    DB.close()
-    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-
-    svd = PCA(n_components=2)
-    svd_out = svd.fit_transform(pivot)
-
-    out = pivot.reset_index()[['parent_id']]
-    out['first'] = svd_out[:, 0]
-    out['second'] = svd_out[:, 1]
-    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
-
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            out.id as publisher_id
-            ,out.first as first
-            ,out.second as second
-        FROM out
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-
-@click.command('links:create-clusters')
-@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
-def create_clusters(source):
-    from sklearn.cluster import KMeans
-
-    table_name = f"top.publisher_clusters_{source}"
-    DB = connect()
-    df = DB.query(f"""
-        SELECT
-            parent_id
-            ,child_id
-            ,{source} as links
-        FROM top.link_edges
-    """).df()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    DB.close()
-    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-
-
-    k = 8
-    kmeans = KMeans(n_clusters=k, n_init="auto")
-    pred = kmeans.fit_predict(pivot)
-    out = pivot.reset_index()[['parent_id']]
-    out['label'] = pred
-    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
-    new_table = out[['id', 'label']]
-
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            n.id as publisher_id
-            ,n.label as label
-        FROM new_table n
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-def to_matrix():
-    """returns an adjacency matrix of publishers to publisher link frequency"""
-
-    DB = connect()
-
-    bias_map = pd.DataFrame([
-        {'label' :'left', 'value' : 0},
-        {'label' :'left-center', 'value' : 1},
-        {'label' :'center', 'value' : 2},
-        {'label' :'right-center', 'value' : 3},
-        {'label' :'right', 'value' : 4},
-        {'label' :'allsides', 'value' : -1},
-    ])
-
-    bias = DB.sql("""
-        SELECT
-            b.id
-            ,b.label
-            ,m.value
-        FROM publisher_bias b
-        JOIN bias_map m
-            ON b.label = m.label
-        WHERE value != -1
-    """).df()
-
-    pub = DB.sql("""
-        select
-            p.id
-            ,p.name
-            ,p.url
-        from publishers p
-    """).df()
-
-    edges = DB.sql("""
-        WITH total as (
-            SELECT
-                s.publisher_id as id
-                ,COUNT(1) as stories
-            FROM stories s
-            GROUP BY
-                s.publisher_id
-        ), p as (
-            SELECT
-                p.id
-                ,stories
-            FROM publishers p
-            LEFT JOIN total t
-                ON t.id = p.id
-            WHERE t.stories >= 20
-        ), cte as (
-            SELECT
-                r.publisher_id as child_id
-                ,s.publisher_id as parent_id
-                ,count(1) as links
-            FROM related_stories r
-            JOIN stories s
-                ON s.id = r.parent_id
-            group by
-                s.publisher_id
-                ,r.publisher_id
-        )
-        SELECT
-            p.id as parent_id
-            ,cte.child_id
-            ,links
-        FROM p
-        left JOIN cte
-            ON p.id = cte.parent_id
-    """).df()
-
-    # only keep values that have more than 1 link
-    test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
-    edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-    pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
-    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-    adj.values.shape
-
-
-    out = pd.DataFrame(adj.index.values, columns=['id'])
-    out = pd.merge(out, pub, how='left', on='id')
-    return out
-
-@click.command('links:analysis')
-def analysis():
-    from sklearn.decomposition import PCA, TruncatedSVD
-    from sklearn.cluster import MiniBatchKMeans
-    adj = to_matrix()
-    pca = PCA(n_components=4)
-    pca_out = pca.fit_transform(adj)
-
-    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
-    svd_out = svd.fit_transform(adj)
-
-    x = svd_out[:, 0]
-    y = svd_out[:, 1]
-
-    x = pca_out[:, 0]
-    y = pca_out[:, 1]
-    sns.scatterplot(x=x, y=y)
-    plt.show()
-
-    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
-    pred = kmeans.fit_predict(pca_out)
-
-    sns.scatterplot(x=x, y=y, hue=pred)
-    plt.show()
-
-    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
-    plt.show()
diff --git a/src/mine.py b/src/mine.py
index 8c2108b..5550de0 100644
--- a/src/mine.py
+++ b/src/mine.py
@@ -1,6 +1,5 @@
-from data.main import data_dir, connect
+from data.main import connect, paths
 import numpy as np
-import sklearn
 from sklearn.cluster import MiniBatchKMeans
 import click
 from pathlib import Path
@@ -11,7 +10,7 @@ from enum import Enum, auto
 
 @click.command(name="mine:embeddings")
 def embeddings():
-    data = np.load(data_dir() / "embeddings.npy")
+    data = np.load(paths('data') / "embeddings.npy")
 
     kmeans = MiniBatchKMeans(n_clusters=5,
                              random_state=0,
                              batch_size=6,
@@ -76,7 +75,7 @@ class PlotName(str, Enum):
 @click.option('-n', '--name', required=True, type=click.Choice(PlotName))
 @click.option('-o', '--output', required=False, type=click.Path())
 def plot(name: PlotName, output: Path):
-    output = output if output else APP_DIR / f'docs/{name}.png'
+    output = output if output else paths('figures') / f'{name}.png'
     if name == PlotName.TitleLength:
         fig, ax = plt.subplots(1,1)
         data = db.sql("""
diff --git a/src/mining/bias.py b/src/mining/bias.py
new file mode 100644
index 0000000..69dad84
--- /dev/null
+++ b/src/mining/bias.py
@@ -0,0 +1,36 @@
+from data.main import connect, map_tld
+import os
+from pathlib import Path
+
+def normalize():
+    with connect() as db:
+        db.sql("""
+            SELECT
+                p.name
+                ,count(1) as ctn
+                ,sum(ctn) over() as all
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.name
+        """)
+
+    with connect() as db:
+        db.sql("""
+            SELECT
+                bias
+                ,count(distinct p.id) as publishers
+                ,count(1) as stories
+                ,count(1) / count(distinct p.id) as ratio
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.bias
+            ORDER BY count(1)
+        """)
diff --git a/src/plots/__init__.py b/src/plots/__init__.py
index ccdc23c..0e3212f 100644
--- a/src/plots/__init__.py
+++ b/src/plots/__init__.py
@@ -1,9 +1,13 @@
 import plots.sentence
 import plots.emotion
 import plots.sentiment
+import plots.links
+import plots.classifier
 
 __all__ = [
     'sentence'
     'emotion',
     'sentiment',
+    'links',
+    'classifier',
 ]
diff --git a/src/plots/bias.py b/src/plots/bias.py
index 5c3f79c..d528fd0 100644
--- a/src/plots/bias.py
+++ b/src/plots/bias.py
@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths, bias_label_to_int, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
@@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:bias-hist')
 def hist():
-    filename = "bias_hist.png"
+    save_to = paths('figures') / "bias_hist.png"
+
+    with connect() as db:
+        data = db.sql("""
+            SELECT
+                p.ordinal
+                ,count(1) as stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON s.id = ps.story_id
+            JOIN mbfc.publishers p
+                ON ps.publisher_id = p.id
+            WHERE ordinal != -1
+            GROUP BY
+                p.ordinal
+        """).df()
 
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            b.ordinal
-            ,count(1) as stories
-        FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        GROUP BY
-            b.ordinal
-    """).df()
-    DB.close()
     ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+
 
 @click.command('plot:bias-publisher-hist')
 def publisher_hist():
-    filename = "bias_publisher_hist.png"
+    save_to = paths('figures') / "bias_publisher_hist.png"
 
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            b.ordinal
-            ,count(1) as publishers
-        FROM publisher_bias pb
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        GROUP BY
-            b.ordinal
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.sql("""
+            SELECT
+                p.ordinal
+                ,count(distinct p.id) as publishers
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            WHERE ordinal != -1
+            GROUP BY
+                p.ordinal
+        """).df()
 
     ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
diff --git a/src/plots/classifier.py b/src/plots/classifier.py
index c85aa7d..bf492a4 100644
--- a/src/plots/classifier.py
+++ b/src/plots/classifier.py
@@ -5,30 +5,32 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 
-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:pca-with-classes')
-def pca_with_classes():
-    filename = "pca_with_classes.png"
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def pca_with_classes(source):
+    save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
 
-    DB = connect()
-    data = DB.query(f"""
-        SELECT
-            p.tld
-            ,b.bias
-            ,c.first
-            ,c.second
-            ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
-        FROM top.publishers p
-        JOIN top.publisher_bias pb
-            ON p.id = pb.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        JOIN top.publisher_pca_normalized c
-            ON c.publisher_id = p.id
-    """).df()
-    DB.close()
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
-    ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
-    print(f"saved: {filename}")
+    with connect() as db:
+        df = db.query(f"""
+            SELECT
+                p.tld
+                ,p.bias
+                ,c.first
+                ,c.second
+                --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+            FROM mbfc.publishers p
+            JOIN publisher_pca_{source} c
+                ON c.publisher_id = p.id
+            WHERE p.ordinal != -1
+            ORDER BY p.ordinal
+        """).df()
+
+    ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
+    ax.set(xlabel="first pca component",
+           ylabel="second pca component")
+    ax.figure.suptitle("pca components vs. bias labels")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')
diff --git a/src/plots/descriptive.py b/src/plots/descriptive.py
index 24cf25b..5bde708 100644
--- a/src/plots/descriptive.py
+++ b/src/plots/descriptive.py
@@ -1,169 +1,190 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 import numpy as np
 
-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:articles-per-year')
 def articles_per_year():
-    filename = 'articles_per_year.png'
+    save_to = paths('figures') / 'articles_per_year.png'
 
-    DB = connect()
-    data = DB.query("""
-        select
-            year(published_at) as year
-            ,count(1) as stories
-        from stories
-        group by
-            year(published_at)
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            select
+                year(published_at) as year
+                ,count(1) as stories
+            from stories
+            group by
+                year(published_at)
+        """).df()
 
     ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of articles per year", ylabel="count of stories (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    print(f"saved: {save_to}")
 
 @click.command('plot:distinct-publishers')
 def distinct_publishers():
-    filename = 'distinct_publishers.png'
+    save_to = paths('figures') / 'distinct_publishers.png'
 
-    DB = connect()
-    data = DB.query("""
-        select
-            year(published_at) as year
-            ,count(distinct publisher_id) as publishers
-        from stories
-        group by
-            year(published_at)
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            select
+                year(published_at) as year
+                ,count(distinct publisher_id) as publishers
+            from stories
+            group by
+                year(published_at)
+        """).df()
 
     ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")
 @click.command('plot:stories-per-publisher')
 def stories_per_publisher():
-    filename = 'stories_per_publisher.png'
+    save_to = paths('figures') / 'stories_per_publisher.png'
 
-    DB = connect()
-    data = DB.query("""
-        with cte as (
-            select
-                publisher_id
-                ,year(published_at) as year
-                ,count(1) as stories
-            from stories
-            group by
-                publisher_id
-                ,year(published_at)
-        ) , agg as (
-            select
-                publisher_id
-                ,avg(stories) as stories_per_year
-                ,case
-                    when avg(stories) < 2 then 2
-                    when avg(stories) < 4 then 4
-                    when avg(stories) < 8 then 8
-                    when avg(stories) < 16 then 16
-                    when avg(stories) < 32 then 32
-                    when avg(stories) < 64 then 64
-                    when avg(stories) < 128 then 128
-                    else 129
-                end as max_avg
-            from cte
-            group by
-                publisher_id
-        )
-        select
-            max_avg
-            ,count(1) as publishers
-        from agg
-        group by
-            max_avg
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            with cte as (
+                select
+                    ps.publisher_id
+                    ,year(s.published_at) as year
+                    ,count(1) as stories
+                from stories s
+                join mbfc.publisher_stories ps
+                    on ps.story_id = s.id
+                group by
+                    ps.publisher_id
+                    ,year(s.published_at)
+            ) , agg as (
+                select
+                    publisher_id
+                    ,avg(stories) as stories_per_year
+                    ,case
+                        when avg(stories) < 2 then 2
+                        when avg(stories) < 4 then 4
+                        when avg(stories) < 8 then 8
+                        when avg(stories) < 16 then 16
+                        when avg(stories) < 32 then 32
+                        when avg(stories) < 64 then 64
+                        when avg(stories) < 128 then 128
+                        else 129
+                    end as max_avg
+                from cte
+                group by
+                    publisher_id
+            )
+            select
+                max_avg
+                ,count(1) as publishers
+            from agg
+            group by
+                max_avg
+        """).df()
 
     ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
-    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
+    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")
 
 @click.command('plot:top-publishers')
 def top_publishers():
     """plot top publishers over time"""
-    filename = 'top_publishers.png'
+    save_to = paths('figures') / 'top_publishers.png'
 
-    DB = connect()
-    data = DB.query("""
-        select
-            p.tld
-            ,year(published_at) as year
-            ,count(1) as stories
-        from (
-            select
-                p.tld
-                ,p.id
-            from top.publishers p
-            join top.stories s
-                on s.publisher_id = p.id
-            group by
-                p.tld
-                ,p.id
-            order by count(1) desc
-            limit 20
-        ) p
-        join top.stories s
-            on s.publisher_id = p.id
-        group by
-            p.tld
-            ,year(published_at)
-        order by count(distinct s.id) desc
-    """).df()
-    DB.close()
+    with connect() as db:
+        db.query("""
+            SELECT
+                p.tld
+                ,p.id
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.tld
+                ,p.id
+            order by count(1) desc
+            limit 20
+        """)
+
+    with connect() as db:
+        data = db.query("""
+            WITH p as (
+                SELECT
+                    p.tld
+                    ,p.id
+                FROM mbfc.publishers p
+                JOIN mbfc.publisher_stories ps
+                    ON ps.publisher_id = p.id
+                JOIN stories s
+                    ON s.id = ps.story_id
+                GROUP BY
+                    p.tld
+                    ,p.id
+                order by count(1) desc
+                limit 20
+            )
+            SELECT
+                p.tld
+                ,YEAR(s.published_at) AS year
+                ,COUNT(1) AS stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN p
+                ON p.id = ps.publisher_id
+            GROUP by
+                p.tld
+                ,YEAR(published_at)
+            ORDER BY year, COUNT(DISTINCT s.id) DESC
+        """).df()
 
     pivot = data.pivot(columns='year', index='tld', values='stories')
     ax = sns.heatmap(pivot, cmap="crest")
     ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")
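The CASE expression in `stories_per_publisher` buckets averages into power-of-two bins; the same idea in numpy, for comparison (toy inputs):

```python
import numpy as np

avg_stories = np.array([1.5, 3.0, 40.0, 500.0])

# right-open bins matching the SQL CASE: <2, <4, ..., <128, else 129
edges = [2, 4, 8, 16, 32, 64, 128]
labels = np.array(edges + [129])
print(labels[np.digitize(avg_stories, edges)])  # [  2   4  64 129]
```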
 @click.command('plot:common_tld')
 def common_tld():
     import dataframe_image as dfi
-    filename = 'common_tld.png'
+    save_to = paths('figures') / 'common_tld.png'
 
-    DB = connect()
-    data = DB.query("""
-        select
-            split_part(url, '.', -1) as tld
-            ,count(1) as publishers
-            ,case when count(1) < 20
-                then string_agg(distinct url, '\t')
-                else NULL
-            end as urls
-        from publishers
-        group by
-            split_part(url, '.', -1)
-        order by
-            count(1) desc
-    """).df()
-    DB.close()
-    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
+    with connect() as db:
+        data = db.query("""
+            select
+                split_part(url, '.', -1) as tld
+                ,count(1) as publishers
+                ,case when count(1) < 20
+                    then string_agg(distinct url, '\t')
+                    else NULL
+                end as urls
+            from publishers
+            group by
+                split_part(url, '.', -1)
+            order by
+                count(1) desc
+        """).df()
+    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
 
 def stats():
@@ -246,7 +267,7 @@ def stats():
 @click.command('plot:bias-stats')
 def bias_stats():
     import dataframe_image as dfi
-    filename = 'bias_stats.png'
+    save_to = paths('figures') / 'bias_stats.png'
 
     DB = connect()
 
@@ -300,3 +321,69 @@ def bias_stats():
     """).df()
     DB.close()
     print(df.to_markdown(index=False))
+
+@click.command('plot:bias-over-time')
+def bias_over_time():
+    """plot bias labels over time"""
+
+    save_to = paths('figures') / 'bias_over_time.png'
+
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                p.bias
+                ,p.id
+                ,date_trunc('year', s.published_at) as year
+                ,count(1) as stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN mbfc.publishers p
+                ON p.id = ps.publisher_id
+            where year(s.published_at) not in (2006, 2023)
+            and p.ordinal != -1
+            GROUP BY
+                p.bias
+                ,p.id
+                ,p.ordinal
+                ,date_trunc('year', s.published_at)
+            order by
+                p.ordinal
+                ,date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+
+def bias_missing():
+
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                date_trunc('year', s.published_at) as year
+                ,s.tld
+                ,count(1) as stories
+            FROM stories s
+            LEFT JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            WHERE ps.publisher_id is NULL
+            AND year(s.published_at) not in (2006, 2023)
+            GROUP BY
+                s.tld
+                ,date_trunc('year', s.published_at)
+            HAVING count(1) > 10
+            ORDER BY
+                date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.show()
+    #plt.savefig(save_to)
+    plt.close()
+    #print(f"saved: {save_to}")
diff --git a/src/plots/emotion.py b/src/plots/emotion.py
index 11666f1..f025afa 100644
--- a/src/plots/emotion.py
+++ b/src/plots/emotion.py
@@ -1,77 +1,79 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import os
-from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:emotion-over-time')
 def emotion_over_time():
-    filename = "emotion_over_time.png"
-    DB = connect()
-    emotions = DB.sql("""
-        SELECT
-            date_trunc('year', s.published_at) AS year
-            ,e.label AS emotion
-            ,count(1) AS stories
-        FROM top.stories s
-        JOIN story_emotions se
-            ON s.id = se.story_id
-        JOIN emotions e
-            ON e.id = se.emotion_id
-        GROUP by
-            date_trunc('year', s.published_at)
-            ,e.label
-    """).df()
-    DB.close()
+    filename = "emotion_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        emotions = db.sql("""
+            SELECT
+                date_trunc('year', s.published_at) AS year
+                ,e.label AS emotion
+                ,count(1) AS stories
+            FROM stories s
+            JOIN story_emotions se
+                ON s.id = se.story_id
+            JOIN emotions e
+                ON e.id = se.emotion_id
+            GROUP by
+                date_trunc('year', s.published_at)
+                ,e.label
+        """).df()
 
     ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
     ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')
 
 @click.command('plot:emotion-regression')
 def emotion_regression():
+    """plot emotion over time as regression"""
+
     from sklearn import linear_model
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import ConfusionMatrixDisplay
 
     filename = "emotion_regression.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-    emotions = DB.query("""
-        SELECT
-            label
-        FROM emotions e
-    """).df()['label'].to_list()
-    DB.close()
-
-    DB = connect()
-    df = DB.sql(f"""
-        SELECT
-            epoch(date_trunc('yearweek', s.published_at)) AS date
-            ,e.id AS emotion_id
-            ,p.id as publisher_id
-            ,count(1) AS stories
-        FROM top.stories s
-        JOIN top.publishers p
-            ON p.id = s.publisher_id
-        JOIN story_emotions se
-            ON s.id = se.story_id
-        JOIN emotions e
-            ON e.id = se.emotion_id
-        GROUP by
-            epoch(date_trunc('yearweek', s.published_at))
-            ,p.id
-            ,e.id
-    """).df()
-    DB.close()
+    with connect() as db:
+        #emotions = db.query("""
+        #    SELECT
+        #        label
+        #    FROM emotions e
+        #""").df()['label'].to_list()
+        df = db.sql(f"""
+            SELECT
+                epoch(date_trunc('yearweek', s.published_at)) AS date
+                ,e.id AS emotion_id
+                ,p.id as publisher_id
+                ,count(1) AS stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN mbfc.publishers p
+                ON p.id = ps.publisher_id
+            JOIN story_emotions se
+                ON s.id = se.story_id
+            JOIN emotions e
+                ON e.id = se.emotion_id
+            WHERE p.ordinal != -1
+            GROUP by
+                epoch(date_trunc('yearweek', s.published_at))
+                ,p.id
+                ,e.id
+        """).df()
 
     results = []
     for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@@ -83,77 +85,59 @@ def emotion_regression():
         results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
     results = pd.DataFrame(results)
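The loop body between these two hunks is unchanged and therefore elided; it fits a trend per (emotion, publisher) group and stores the slope as `per_year`. A sketch of that idea with a plain least-squares fit (the repo uses sklearn's `linear_model`; exact details may differ):

```python
import numpy as np
import pandas as pd

# toy weekly story counts for one (emotion, publisher) group
group = pd.DataFrame({
    'date':    [0, 604800, 1209600, 1814400],  # epoch seconds, weekly steps
    'stories': [1, 2, 2, 4],
})

slope, _ = np.polyfit(group['date'], group['stories'], deg=1)
per_year = slope * 60 * 60 * 24 * 365  # stories/second -> stories/year
print(round(per_year, 1))
```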
and emotion" - ,xticklabels=ticklabels + ,xticklabels=ticklabels() ,xlabel="bias" ,ylabel="emotion") plt.tight_layout() - plt.savefig(out_path / filename) - print(f"saved: {filename}") + plt.savefig(save_to) + plt.close() + print(f"saved: {save_to}") @click.command('plot:emotion-hist') def emotion_hist(): + filename = "emotion_hist.png" + save_to = paths('figures') / filename - DB = connect() - DB.query("""describe story_emotions""") + with connect() as db: + data = db.sql(""" + SELECT + p.bias + ,count(1) as stories + FROM stories s + JOIN mbfc.publisher_stories ps + ON ps.story_id = s.id + JOIN mbfc.publishers p + ON p.id = ps.publisher_id + WHERE p.ordinal != -1 + GROUP BY + p.bias + """).df() - DB.query(""" - select - e.label - ,count(distinct s.id) as stories - ,count(distinct s.publisher_id) as publishers - from story_emotions se - join emotions e - on e.id = se.emotion_id - join top.stories s - on s.id = se.story_id - group by - e.label - """).df().to_markdown(index=False) - - data = DB.sql(""" - SELECT - b.ordinal - ,count(1) as stories - FROM stories s - JOIN publisher_bias pb - ON pb.publisher_id = s.publisher_id - JOIN bias_ratings b - ON b.id = pb.bias_id - GROUP BY - b.ordinal - """).df() - DB.close() - - ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue') - ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] - ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels) + ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels()) + ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels()) plt.tight_layout() - plt.savefig(out_path / filename) - print(f"saved: {filename}") + plt.savefig(save_to) + plt.close() + print(f"saved: {save_to}") diff --git a/src/plots/links.py b/src/plots/links.py index a5d1ada..19985b4 100644 --- a/src/plots/links.py +++ b/src/plots/links.py @@ -9,20 +9,20 @@ import numpy as np from sklearn.metrics import silhouette_score import pandas as pd -out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures' @click.command('plot:link-elbow') def elbow(): from sklearn.cluster import KMeans - filename = 'link_cluster_elbow.png' + save_to = paths('figures') / 'link_cluster_elbow.png' + + with connect() as db: + df = db.query(""" + SELECT + * + FROM link_edges + """).df() - DB = connect() - df = DB.query(""" - SELECT - * - FROM link_edges - """).df() pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0) to_plot = [] @@ -36,8 +36,9 @@ def elbow(): ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia) ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points") - plt.savefig(out_dir / filename) + plt.savefig(save_to) plt.close() + print(f"saved plot: {save_to}") # randomly pick 8 @@ -45,72 +46,65 @@ def elbow(): @click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links') def link_pca_clusters(source): - filename = f"link_pca_clusters_{source}.png" + save_to = paths('figures') / f"link_pca_clusters_{source}.png" - DB = connect() - df = DB.query(f""" - SELECT - c.label as cluster - ,p.tld - --,b.label as bias - ,pca.first - ,pca.second - ,s.cnt as stories - FROM top.publisher_clusters_{source} c - JOIN top.publishers p - ON c.publisher_id = p.id - JOIN - ( - select - s.publisher_id - ,count(1) as cnt - FROM top.stories s - GROUP BY - s.publisher_id - ) s - ON s.publisher_id = p.id - JOIN 
 
 
 # randomly pick 8
@@ -45,72 +46,65 @@
 @click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def link_pca_clusters(source):
 
-    filename = f"link_pca_clusters_{source}.png"
+    save_to = paths('figures') / f"link_pca_clusters_{source}.png"
 
-    DB = connect()
-    df = DB.query(f"""
-        SELECT
-            c.label as cluster
-            ,p.tld
-            --,b.label as bias
-            ,pca.first
-            ,pca.second
-            ,s.cnt as stories
-        FROM top.publisher_clusters_{source} c
-        JOIN top.publishers p
-            ON c.publisher_id = p.id
-        JOIN
-        (
-            select
-                s.publisher_id
-                ,count(1) as cnt
-            FROM top.stories s
-            GROUP BY
-                s.publisher_id
-        ) s
-            ON s.publisher_id = p.id
-        JOIN top.publisher_pca_{source} pca
-            ON pca.publisher_id = p.id
-    """).df()
-    DB.close()
+    with connect() as db:
+        df = db.query(f"""
+            SELECT
+                c.label as cluster
+                ,p.tld
+                --,b.label as bias
+                ,pca.first
+                ,pca.second
+                ,s.cnt as stories
+            FROM top.publisher_clusters_{source} c
+            JOIN top.publishers p
+                ON c.publisher_id = p.id
+            JOIN
+            (
+                select
+                    s.publisher_id
+                    ,count(1) as cnt
+                FROM top.stories s
+                GROUP BY
+                    s.publisher_id
+            ) s
+                ON s.publisher_id = p.id
+            JOIN top.publisher_pca_{source} pca
+                ON pca.publisher_id = p.id
+        """).df()
 
     ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
     ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
-
-    # .df().groupby(['cluster', 'bias']).describe()
-
-
+    plt.savefig(save_to)
+    print(f"saved plot: {save_to}")
 
 def test():
-    data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    DB.query("""
-        SELECT
-            p.id as publisher_id
-            ,p.name
-            ,p.tld
-            ,cast(b.bias_id as int) as bias_id
-            ,count(1) as stories
-        FROM publishers p
-        JOIN stories s
-            ON s.publisher_id = p.id
-        JOIN publisher_clusters c
-            ON c.publisher_id = p.id
-        LEFT JOIN publisher_bias b
-            ON b.publisher_id = p.id
-        where bias_id is null
-        group by
-            p.id
-            ,p.name
-            ,p.tld
-            ,b.bias_id
-        ORDER BY count(1) desc
-    """)
-
-    # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
-    DB.close()
+    with connect() as db:
+        db.query("""
+            SELECT
+                p.id as publisher_id
+                ,p.name
+                ,p.tld
+                ,cast(b.bias_id as int) as bias_id
+                ,count(1) as stories
+            FROM publishers p
+            JOIN stories s
+                ON s.publisher_id = p.id
+            JOIN publisher_clusters c
+                ON c.publisher_id = p.id
+            LEFT JOIN publisher_bias b
+                ON b.publisher_id = p.id
+            where bias_id is null
+            group by
+                p.id
+                ,p.name
+                ,p.tld
+                ,b.bias_id
+            ORDER BY count(1) desc
+        """)
 
 @click.command('plot:link-classifier')
 def link_confusion():
@@ -176,49 +172,51 @@ def link_confusion():
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.metrics import ConfusionMatrixDisplay
 
-    filename = "link_confusion.png"
+    save_to = paths('figures') / "link_confusion.png"
 
-    DB = connect()
-    bias = DB.query("""
-        SELECT
-            p.id as publisher_id
-            ,b.ordinal
-        FROM top.publishers p
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = p.id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
+    with connect() as db:
+        bias = db.query("""
+            SELECT
+                p.id as publisher_id
+                ,b.ordinal
+            FROM top.publishers p
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = p.id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
+
+        df = db.query("""
+            SELECT
+                *
+            FROM top.link_edges
+            WHERE parent_id in (
+                select
+                    publisher_id
+                from bias
+            )
+            AND child_id in (
+                select
+                    publisher_id
+                from bias
+            )
+        """).df()
 
-    df = DB.query("""
-        SELECT
-            *
-        FROM top.link_edges
-        WHERE parent_id in (
-            select
-                publisher_id
-            from bias
-        )
-        AND child_id in (
-            select
-                publisher_id
-            from bias
-        )
-    """).df()
     pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
 
     x = pivot.values
     y = bias.sort_values('publisher_id').ordinal
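+    # NOTE (assumption): the pivot's rows come out ordered by parent_id, so
+    # sorting the bias frame by publisher_id is what aligns y with the rows of
+    # x; a cheap guard would be:
+    #   assert (pivot.index.values == bias.sort_values('publisher_id').publisher_id.values).all()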
 
-    data = DB.query(f"""
-        SELECT
-            p.id as publisher_id
-            ,pca.first
-            ,pca.second
-        FROM top.publisher_pca_onehot pca
-        JOIN top.publishers p
-            ON pca.publisher_id = p.id
-    """).df()
+    with connect() as db:
+        data = db.query(f"""
+            SELECT
+                p.id as publisher_id
+                ,pca.first
+                ,pca.second
+            FROM top.publisher_pca_onehot pca
+            JOIN top.publishers p
+                ON pca.publisher_id = p.id
+        """).df()
@@ -235,11 +233,11 @@ def link_confusion():
     ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
     ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
     ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
 
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
-    plt.savefig(out_dir / filename)
-    plt.close()
-    print(f"saved plot: {filename}")
+    # ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
+    # plt.savefig(out_dir / filename)
+    # plt.close()
+    # print(f"saved plot: {filename}")
diff --git a/src/plots/sentence.py b/src/plots/sentence.py
index b94e26e..d783f2b 100644
--- a/src/plots/sentence.py
+++ b/src/plots/sentence.py
@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 from pathlib import Path
 import seaborn as sns
@@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-
 @click.command('plot:sentence-pca')
 def sentence_pca():
-    filename = "embedding_sentence_pca.png"
-    DB = connect()
+    save_to = paths('figures') / "embedding_sentence_pca.png"
 
-    data = DB.query("""
-        SELECT
-            pca.first
-            ,pca.second
-            ,b.bias as label
-        FROM top.story_embeddings_pca pca
-        JOIN top.stories s
-            ON s.id = pca.story_id
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                pca.first
+                ,pca.second
+                ,b.bias as label
+            FROM top.story_embeddings_pca pca
+            JOIN top.stories s
+                ON s.id = pca.story_id
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = s.publisher_id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
 
     ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
+    plt.close()  # avoid leaking this figure into the next plot command
 
 @click.command('plot:avg-sentence-pca')
 def avg_sentence_pca():
-    filename = "avg_embedding_sentence_pca.png"
-    DB = connect()
+    save_to = paths('figures') / "avg_embedding_sentence_pca.png"
 
-    data = DB.query("""
-        SELECT
-            pca.first
-            ,pca.second
-            ,p.tld
-            ,b.bias as label
-        FROM top.publisher_embeddings_pca pca
-        JOIN top.publishers p
-            ON p.id = pca.publisher_id
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = p.id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                pca.first
+                ,pca.second
+                ,p.tld
+                ,b.bias as label
+            FROM top.publisher_embeddings_pca pca
+            JOIN top.publishers p
+                ON p.id = pca.publisher_id
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = p.id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
 
     ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
     ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
+    plt.close()  # avoid leaking this figure into the next plot command
 
 @click.command('plot:sentence-confusion')
 def sentence_confusion():
@@ -65,32 +60,31 @@ def sentence_confusion():
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.metrics import ConfusionMatrixDisplay
 
-    filename = "sentence_confusion.png"
+    save_to = paths('figures') / "sentence_confusion.png"
 
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
     ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
 
-    DB = connect()
-    data = DB.query("""
-        SELECT
-            ids.index
-            ,s.id
-            ,b.ordinal
-        FROM ids
-        JOIN top.stories s
-            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,b.ordinal
+            FROM ids
+            JOIN top.stories s
+                ON ids.story_id = s.id
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = s.publisher_id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
+        pub = db.query("""
+            SELECT
+                *
+            FROM top.publishers
+        """).df()
 
     train, test = train_test_split(data)
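+    # NOTE: the split above is unseeded; train_test_split(data, random_state=0)
+    # (for example) would make the confusion matrix reproducible across runs.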
     train_x, train_y = embeddings[train['index']], train['ordinal']
@@ -105,7 +99,7 @@
     ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
     ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
     ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
 
diff --git a/src/plots/sentiment.py b/src/plots/sentiment.py
index 7a9f48c..0f447ac 100644
--- a/src/plots/sentiment.py
+++ b/src/plots/sentiment.py
@@ -1,138 +1,135 @@
 import click
-from data.main import connect
-import os
-from pathlib import Path
+from data.main import connect, paths, ticklabels
 import seaborn as sns
 import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
 
 @click.command('plot:sentiment-over-time')
 def over_time():
-    filename = "sentiment_over_time.png"
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            avg(sent.class_id) as sentiment
-            ,s.published_at as date
-        FROM top.story_sentiments sent
-        JOIN top.stories s
-            ON s.id = sent.story_id
-        GROUP BY
-            s.published_at
-    """).df()
-    DB.close()
+    filename = "sentiment_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        data = db.sql("""
+            SELECT
+                avg(sent.class_id) as sentiment
+                ,s.published_at as date
+            FROM top.story_sentiments sent
+            JOIN top.stories s
+                ON s.id = sent.story_id
+            GROUP BY
+                s.published_at
+        """).df()
 
     ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
     ax.set(title="sentiment vs. time")
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 
 @click.command('plot:bias-vs-sentiment-over-time')
 def bias_over_time():
+    """plot sentiment/bias vs. time"""
+    filename = "bias_vs_sentiment_over_time.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            avg(sent.class_id) as sentiment
-            ,date_trunc('yearweek', s.published_at) as date
-            --,b.ordinal as ordinal
-            ,b.bias
-        FROM top.story_sentiments sent
-        JOIN top.stories s
-            ON s.id = sent.story_id
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        GROUP BY
-            date_trunc('yearweek', s.published_at)
-            ,b.bias
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.sql("""
+            with cte as (
+                SELECT
+                    avg(sent.class_id) as sentiment
+                    ,date_trunc('yearweek', s.published_at) as date
+                    ,p.bias
+                FROM story_sentiments sent
+                JOIN stories s
+                    ON s.id = sent.story_id
+                JOIN mbfc.publisher_stories ps
+                    ON ps.story_id = s.id
+                JOIN mbfc.publishers p
+                    ON p.id = ps.publisher_id
+                WHERE p.ordinal != -1
+                GROUP BY
+                    date_trunc('yearweek', s.published_at)
+                    ,p.bias
+            )
+            SELECT
+                median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
+                ,date
+                ,bias
+            FROM cte
+            WHERE year(date) not in (2005, 2023)
+        """).df()
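+        # the window above takes, for each bias, the median of the weekly
+        # averages over the current row and the 7 that follow it (dates sorted
+        # descending), i.e. an 8-week trailing median.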
distance to election""" + filename = "bias_vs_recent_winner.png" + save_to = paths('figures') / filename - DB = connect() - data = DB.sql(""" - SELECT - e.days_away as days_away - ,b.ordinal - ,avg(sent.class_id) as sentiment - ,count(1) as stories - FROM top.stories s - JOIN top.story_sentiments sent - ON s.id = sent.story_id - JOIN election_distance e - ON e.publish_date = s.published_at - JOIN publisher_bias pb - ON pb.publisher_id = s.publisher_id - JOIN bias_ratings b - ON b.id = pb.bias_id - GROUP BY - e.days_away - ,b.ordinal - """).df() - DB.close() - data + with connect() as db: + data = db.sql(""" + SELECT + round(e.days_away, -1) as days_away + ,p.bias + ,avg(sent.class_id) as sentiment + ,count(1) as stories + FROM stories s + JOIN story_sentiments sent + ON s.id = sent.story_id + JOIN election_distance e + ON e.publish_date = s.published_at + JOIN mbfc.publisher_stories ps + ON ps.story_id = s.id + JOIN mbfc.publishers p + ON p.id = ps.publisher_id + GROUP BY + round(e.days_away, -1) + ,p.bias + """).df() - ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal']) + ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow') ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment") plt.tight_layout() - plt.savefig(out_path / filename) + plt.savefig(save_to) plt.close() - - print(f"saved: {filename}") + print(f"saved: {save_to}") @click.command('plot:sentiment-hist') def sentiment_hist(): + filename = "sentiment_hist.png" + save_to = paths('figures') / filename - DB = connect() + with connect() as db: + data = db.sql(""" + SELECT + p.bias + ,count(1) as stories + FROM stories s + JOIN mbfc.publisher_stories ps + ON ps.story_id = s.id + JOIN mbfc.publishers p + ON p.id = ps.publisher_id + WHERE p.ordinal != -1 + GROUP BY + p.bias + """).df() - DB.query(""" - select - sent.label - ,count(distinct s.id) as stories - ,count(distinct s.publisher_id) as publishers - from top.story_sentiments sent - join top.stories s - on s.id = sent.story_id - group by - sent.label - """).df().to_markdown(index=False) - - data = DB.sql(""" - SELECT - b.ordinal - ,count(1) as stories - FROM stories s - JOIN publisher_bias pb - ON pb.publisher_id = s.publisher_id - JOIN bias_ratings b - ON b.id = pb.bias_id - GROUP BY - b.ordinal - """).df() - DB.close() - - ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue') - ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] - ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels) + ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow') + ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels()) plt.tight_layout() - plt.savefig(out_path / filename) - print(f"saved: {filename}") + plt.savefig(save_to) + plt.close() + print(f"saved: {save_to}") diff --git a/src/selection.py b/src/selection.py deleted file mode 100644 index 9c34543..0000000 --- a/src/selection.py +++ /dev/null @@ -1,48 +0,0 @@ -from data.main import connect -import pandas as pd -import numpy as np - -DB = connect() -edges = DB.query(""" - select - * - from link_edges -""").df() -DB.close() - -edges - -adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0) -select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id']) - -DB = connect() -DB.query("create schema top") - 
-DB.query(""" - CREATE OR REPLACE TABLE top.publishers AS - SELECT - p.* - FROM publishers p - JOIN select_publishers s - ON s.publisher_id = p.id -""") - -DB.query(""" - CREATE OR REPLACE TABLE top.stories AS - SELECT - s.* - FROM stories s - JOIN top.publishers p - ON s.publisher_id = p.id - WHERE year(s.published_at) >= 2006 - AND year(s.published_at) < 2023 -""") - -DB.query(""" - CREATE OR REPLACE TABLE top.related_stories AS - SELECT - r.* - FROM top.stories s - JOIN related_stories r - ON s.id = r.parent_id -""") diff --git a/src/sentence.py b/src/sentence.py index 52c447e..4d84471 100644 --- a/src/sentence.py +++ b/src/sentence.py @@ -1,7 +1,7 @@ from transformers import AutoTokenizer, AutoModel import torch import torch.nn.functional as F -from data.main import connect, data_dir +from data.main import connect, paths import os from pathlib import Path import numpy as np @@ -62,7 +62,7 @@ def embed(chunks): ids = np.concatenate(embedding_ids) # save embeddings - save_to = data_dir() / 'embeddings.npy' + save_to = paths('data') / 'embeddings.npy' np.save(save_to, embeddings) print(f"embeddings saved: {save_to}") @@ -75,29 +75,28 @@ def embed(chunks): @click.command('sentence:create-avg-pca-table') def create_avg_pca_table(): from sklearn.decomposition import PCA - data_path = Path(os.getenv('DATA_MINING_DATA_DIR')) - embeddings = np.load(data_path / 'embeddings.npy') - embedding_ids = np.load(data_path / 'embedding_ids.npy') + embeddings = np.load(paths('data') / 'embeddings.npy') + embedding_ids = np.load(paths('data') / 'embedding_ids.npy') ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index() - DB = connect() - data = DB.query(""" - SELECT - ids.index - ,s.id - ,s.publisher_id - ,b.ordinal - FROM ids - JOIN top.stories s - ON ids.story_id = s.id - JOIN top.publisher_bias pb - ON pb.publisher_id = s.publisher_id - JOIN bias_ratings b - ON b.id = pb.bias_id - """).df() - DB.close() + + with connect() as db: + data = db.query(""" + SELECT + ids.index + ,s.id + ,s.publisher_id + ,b.ordinal + FROM ids + JOIN top.stories s + ON ids.story_id = s.id + JOIN top.publisher_bias pb + ON pb.publisher_id = s.publisher_id + JOIN bias_ratings b + ON b.id = pb.bias_id + """).df() results = [] for publisher_id, group in data.groupby(['publisher_id']): @@ -115,47 +114,45 @@ def create_avg_pca_table(): results['second'] = pred[:, 1] table_name = "top.publisher_embeddings_pca" - DB = connect() - DB.query(f""" - CREATE OR REPLACE TABLE {table_name} AS - SELECT - results.publisher_id as publisher_id - ,results.first as first - ,results.second as second - FROM results - """) - DB.close() + with connect() as db: + db.query(f""" + CREATE OR REPLACE TABLE {table_name} AS + SELECT + results.publisher_id as publisher_id + ,results.first as first + ,results.second as second + FROM results + """) + print(f"created {table_name}") @click.command('sentence:create-pca-table') def create_pca_table(): from sklearn.decomposition import PCA - data_path = Path(os.getenv('DATA_MINING_DATA_DIR')) - embeddings = np.load(data_path / 'embeddings.npy') - embedding_ids = np.load(data_path / 'embedding_ids.npy') + embeddings = np.load(path('data') / 'embeddings.npy') + embedding_ids = np.load(path('data') / 'embedding_ids.npy') - DB = connect() - data = DB.query(""" - SELECT - ids.index - ,s.id - ,b.ordinal - FROM ids - JOIN top.stories s - ON ids.story_id = s.id - JOIN top.publisher_bias pb - ON pb.publisher_id = s.publisher_id - JOIN bias_ratings b - ON b.id = pb.bias_id - """).df() - pub = DB.query(""" - 
 
-    DB = connect()
-    data = DB.query("""
-        SELECT
-            ids.index
-            ,s.id
-            ,b.ordinal
-        FROM ids
-        JOIN top.stories s
-            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    DB.close()
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,b.ordinal
+            FROM ids
+            JOIN top.stories s
+                ON ids.story_id = s.id
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = s.publisher_id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
+        pub = db.query("""
+            SELECT
+                *
+            FROM top.publishers
+        """).df()
 
     x = embeddings[data['index']]
     y = data['ordinal'].to_numpy().reshape(-1, 1)
@@ -166,42 +163,41 @@
 
     table_name = f"top.story_embeddings_pca"
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            data.id as story_id
-            ,data.first as first
-            ,data.second as second
-        FROM data
-    """)
-    DB.close()
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                data.id as story_id
+                ,data.first as first
+                ,data.second as second
+            FROM data
+        """)
+
+    print(f"created {table_name}")
 
 
 @click.command('sentence:create-svm-table')
 def create_svm_table():
     from sklearn import svm
     from sklearn.linear_model import SGDClassifier
-    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
     ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
 
-    DB = connect()
-    data = DB.query("""
-        SELECT
-            ids.index
-            ,s.id
-            ,b.ordinal
-        FROM ids
-        JOIN top.stories s
-            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-    """).df()
+    with connect() as db:
+        data = db.query("""
+            SELECT
+                ids.index
+                ,s.id
+                ,b.ordinal
+            FROM ids
+            JOIN top.stories s
+                ON ids.story_id = s.id
+            JOIN top.publisher_bias pb
+                ON pb.publisher_id = s.publisher_id
+            JOIN bias_ratings b
+                ON b.id = pb.bias_id
+        """).df()
 
     x = embeddings[data['index']]
     #y = data['ordinal'].to_numpy().reshape(-1, 1)