Merge branch 'feature_factcheck'

2023-06-01 09:44:28 -07:00 · 2023-06-01 09:44:28 -07:00 · 81f4f37c9d
parent 398228f02c 79808f01d6
commit 81f4f37c9d
40 changed files with 1354 additions and 1137 deletions
--- a/docs/figures/bias_hist.png
+++ b/docs/figures/bias_hist.png
--- a/docs/figures/bias_over_time.png
+++ b/docs/figures/bias_over_time.png
--- a/docs/figures/bias_publisher_hist.png
+++ b/docs/figures/bias_publisher_hist.png
--- a/docs/figures/bias_vs_recent_winner.png
+++ b/docs/figures/bias_vs_recent_winner.png
--- a/docs/figures/bias_vs_sentiment_over_time.png
+++ b/docs/figures/bias_vs_sentiment_over_time.png
--- a/docs/figures/emotion_hist.png
+++ b/docs/figures/emotion_hist.png
--- a/docs/figures/emotion_over_time.png
+++ b/docs/figures/emotion_over_time.png
--- a/docs/figures/emotion_regression.png
+++ b/docs/figures/emotion_regression.png
--- a/docs/figures/link_links_pca_with_classes.png
+++ b/docs/figures/link_links_pca_with_classes.png
--- a/docs/figures/link_normalized_pca_with_classes.png
+++ b/docs/figures/link_normalized_pca_with_classes.png
--- a/docs/figures/link_onehot_pca_with_classes.png
+++ b/docs/figures/link_onehot_pca_with_classes.png
--- a/docs/figures/link_{source}_pca_with_classes.png
+++ b/docs/figures/link_{source}_pca_with_classes.png
--- a/docs/figures/pca_with_classes.png
+++ b/docs/figures/pca_with_classes.png
--- a/docs/figures/sentiment_hist.png
+++ b/docs/figures/sentiment_hist.png
--- a/docs/figures/sentiment_over_time.png
+++ b/docs/figures/sentiment_over_time.png
--- a/docs/figures/stories_per_publisher.png
+++ b/docs/figures/stories_per_publisher.png
--- a/docs/figures/top_publishers.png
+++ b/docs/figures/top_publishers.png
--- a/src/apriori.py
+++ b/src/apriori.py
@ -0,0 +1,27 @@
 from efficient_apriori import apriori
 from data.main import connect
@click.command("apriori:rules")
 def rules():
    """Mine association rules between publisher domains and print them.

    Each transaction is a parent publisher's tld followed by the tlds of the
    publishers of its related stories, grouped per parent tld. Rules are
    mined with ``min_support=0.1`` and ``min_confidence=0.8`` and printed
    one per line.
    """
    # the context manager releases the connection even when the query raises
    with connect() as db:
        data = db.query("""
            SELECT
                --list_prepend(parent.id, list(child.id)) as transaction
                list_prepend(parent.tld, list(child.tld)) as transaction
            FROM stories s
            JOIN related_stories r
            ON r.parent_id = s.id
            JOIN publishers parent
            ON parent.id = s.publisher_id
            JOIN publishers child
            ON child.id = r.publisher_id
            GROUP BY
                --parent.id
                parent.tld
        """).df()
    # apriori needs hashable transactions, so each list becomes a tuple
    transactions = [tuple(items) for items in data.transaction]
    _, found_rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
    print(*found_rules, sep="\n")
--- a/src/bias.py
+++ b/src/bias.py
@ -1,67 +1,42 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import pandas as pd
 from lxml import etree
 from pathlib import Path
 import os
 import csv
 def label_to_int(rating:str) -> int:
    """Translate a bias rating label into its integer class id.

    The five spectrum labels map to 0..4 from 'left' to 'right'; the
    label 'allsides' maps to -1. Any other label raises ``KeyError``.
    """
    spectrum = ('left', 'left-center', 'center', 'right-center', 'right')
    class_ids = {label: position for position, label in enumerate(spectrum)}
    # 'allsides' sits outside the 0..4 range
    class_ids['allsides'] = -1
    return class_ids[rating]
 def int_to_label(class_id: int) -> str:
    """Translate an integer class id back into its bias rating label.

    Ids 0..4 map to 'left' .. 'right'; -1 maps to 'allsides'. Any other
    id raises ``KeyError``.
    """
    spectrum = ('left', 'left-center', 'center', 'right-center', 'right')
    labels = dict(enumerate(spectrum))
    # -1 is the id used for the 'allsides' label
    labels[-1] = 'allsides'
    return labels[class_id]
@click.command(name="bias:normalize")
 def normalize() -> None:
-    DB = connect()
+    with connect() as db:
-
+        db.sql("""
-    DB.sql("""
+            CREATE OR REPLACE TABLE publisher_bias AS
-        CREATE OR REPLACE TABLE publisher_bias AS
+            WITH cte AS (
-        WITH cte AS (
+                SELECT
-            SELECT
+                    p.id as publisher_id
-                p.id as publisher_id
+                    ,b.id as bias_id
-                ,b.id as bias_id
+                    ,b.bias as label
-                ,b.bias as label
+                    ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
-                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
+                FROM bias_ratings b
-            FROM bias_ratings b
+                JOIN top.publishers p
-            JOIN top.publishers p
+                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
-            ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
+            ),ranked AS (
-        ),ranked AS (
+                SELECT
                    publisher_id
                    ,bias_id
                    ,label
                    ,similarity
                    ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
                FROM cte
            )
            SELECT
                publisher_id
                ,bias_id
                ,label
-                ,similarity
+                ,bias_id
-                ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
+            FROM ranked
-            FROM cte
+            WHERE ranked.rn = 1
-        )
+        """)
        SELECT
            publisher_id
            ,label
            ,bias_id
        FROM ranked
        WHERE ranked.rn = 1
    """)
    mapping = [
            {'label' :'left' , 'ordinal': -2},
@ -72,22 +47,20 @@ def normalize() -> None:
    ]
    mapping = pd.DataFrame(mapping)
-    DB.query("alter table bias_ratings add column ordinal int")
+    with connect() as db:
-
+        db.query("alter table bias_ratings add column ordinal int")
-    DB.query("""
+        db.query("""
-        update bias_ratings b
+            update bias_ratings b
-        set ordinal = o.ordinal
+            set ordinal = o.ordinal
-        FROM mapping o
+            FROM mapping o
-        WHERE o.label = b.bias
+            WHERE o.label = b.bias
-    """)
+        """)
@click.command(name='bias:parse')
 def parse() -> None:
    """parse the save html page of allslides.com bias ratings into a normalized csv file"""
-    DB = connect()
+    bias_html = paths('data') / 'allsides.html'
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    bias_html = DATA_DIR / 'allsides.html'
    parser = etree.HTMLParser()
    tree = etree.parse(str(bias_html), parser)
@ -111,65 +84,63 @@ def parse() -> None:
        rating['disagree'] = int(disagree)
        ratings.append(rating)
    df = pd.DataFrame(ratings)
-    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load")
 def load() -> None:
-    DB = connect()
+    f = str(paths('data') / "bias_ratings.csv")
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")
-    DB.sql(f"""
+    with connect() as db:
-        CREATE TABLE bias_ratings as 
+        db.sql(f"""
-        select 
+            CREATE TABLE bias_ratings as 
-            row_number() over(order by b.publisher) as id
+            select 
-            ,b.*
+                row_number() over(order by b.publisher) as id
-        from read_csv_auto('{f}') b
+                ,b.*
-    """)
+            from read_csv_auto('{f}') b
        """)
@click.command('bias:export')
 def export():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
+    with connect() as db:
        all_bias = db.query("""
            SELECT
                id as bias_id
                ,publisher as name
                ,bias as label
            FROM bias_ratings
            ORDER by agree desc
        """)
-    DB = connect()
+    all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
-    all_bias = DB.query("""
+    with connect() as db:
-        SELECT
+        mapped_bias = db.query("""
-            id as bias_id
+            SELECT
-            ,publisher as name
+                p.id as publisher_id
-            ,bias as label
+                ,p.name as name
-        FROM bias_ratings
+                ,p.tld as tld
-        ORDER by agree desc
+                ,b.label as bias
                ,b.bias_id as bias_id
            FROM top.publishers p
            LEFT JOIN publisher_bias b
            ON b.publisher_id = p.id
    """)
-    all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
+    mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
    mapped_bias = DB.query("""
        SELECT
            p.id as publisher_id
            ,p.name as name
            ,p.tld as tld
            ,b.label as bias
            ,b.bias_id as bias_id
        FROM top.publishers p
        LEFT JOIN publisher_bias b
        ON b.publisher_id = p.id
    """)
    mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
    DB.close()
@click.command('bias:import-mapped')
 def import_mapped():
    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
    table_name = "top.publisher_bias"
-    DB = connect()
+    df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
-    df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
+
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                publisher_id AS publisher_id
                ,cast(bias_id AS int) as bias_id
            FROM df
            WHERE bias_id IS NOT NULL
        """)
    DB.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            publisher_id AS publisher_id
            ,cast(bias_id AS int) as bias_id
        FROM df
        WHERE bias_id IS NOT NULL
    """)
    print(f"created table: {table_name}")
--- a/src/cli.py
+++ b/src/cli.py
@ -1,5 +1,7 @@
 import click
 from dotenv import load_dotenv
 import data
 import plots
@click.group()
 def cli():
@ -7,12 +9,20 @@ def cli():
 if __name__ == "__main__":
    load_dotenv()
-    from data import scrape
+
-    cli.add_command(scrape.download)
+    # original bias ratings
-    cli.add_command(scrape.parse)
+    cli.add_command(data.scrape.download)
-    cli.add_command(scrape.load)
+    cli.add_command(data.scrape.parse)
-    cli.add_command(scrape.normalize)
+    cli.add_command(data.scrape.load)
-    cli.add_command(scrape.create_elections_table)
+    cli.add_command(data.scrape.normalize)
    cli.add_command(data.scrape.create_elections_table)
    cli.add_command(data.factcheck.parse_index)
    cli.add_command(data.factcheck.scrape)
    cli.add_command(data.links.create_table)
    cli.add_command(data.links.create_pca)
    cli.add_command(data.links.create_clusters)
    import word
    # cli.add_command(word.distance)
@ -23,10 +33,12 @@ if __name__ == "__main__":
    cli.add_command(bias.parse)
    cli.add_command(bias.load)
    cli.add_command(bias.normalize)
    import mine
    cli.add_command(mine.embeddings)
    cli.add_command(mine.cluster)
    cli.add_command(mine.plot)
    import emotion
    cli.add_command(emotion.extract)
    cli.add_command(emotion.normalize)
@ -40,34 +52,20 @@ if __name__ == "__main__":
    from train import main as train_main
    cli.add_command(train_main.main)
-    import plots.descriptive as plotd
+    cli.add_command(plots.descriptive.articles_per_year)
-    cli.add_command(plotd.articles_per_year)
+    cli.add_command(plots.descriptive.distinct_publishers)
-    cli.add_command(plotd.distinct_publishers)
+    cli.add_command(plots.descriptive.stories_per_publisher)
-    cli.add_command(plotd.stories_per_publisher)
+    cli.add_command(plots.descriptive.top_publishers)
-    cli.add_command(plotd.top_publishers)
+    cli.add_command(plots.descriptive.common_tld)
    cli.add_command(plotd.common_tld)
    import  links as linkcli
    cli.add_command(linkcli.create_table)
    cli.add_command(linkcli.create_pca)
    cli.add_command(linkcli.create_clusters)
    import  plots.links as plotl
    cli.add_command(plotl.elbow)
    cli.add_command(plotl.link_pca_clusters)
    import  plots.classifier as plotc
    cli.add_command(plotc.pca_with_classes)
    import plots
    cli.add_command(plots.sentence.sentence_pca)
    cli.add_command(plots.sentence.avg_sentence_pca)
    cli.add_command(plots.emotion.emotion_over_time)
    cli.add_command(plots.emotion.emotion_regression)
    cli.add_command(plots.sentiment.over_time)
    cli.add_command(plots.sentiment.bias_over_time)
    cli.add_command(plots.sentiment.bias_vs_recent_winner)
-
+    cli.add_command(plots.links.elbow)
    cli.add_command(plots.links.link_pca_clusters)
    cli.add_command(plots.classifier.pca_with_classes)
    cli()
--- a/src/data/init.py
+++ b/src/data/init.py
@ -1,6 +1,10 @@
 import data.main
 import data.scrape
 import data.factcheck
 import data.links
 __all__ = [
    'main'
    ,'scrape'
    ,'factcheck'
    ,'links'
 ]
--- a/src/data/factcheck.py
+++ b/src/data/factcheck.py
@ -0,0 +1,171 @@
 import requests
 from lxml import etree
 from bs4 import BeautifulSoup
 import re
 from io import BytesIO
 import pandas as pd
 from pathlib import Path
 import os
 import sys
 import click
 from data.main import connect, map_tld, paths
 from random import randint
 from time import sleep
 from tqdm import tqdm
@click.command('mbfc:parse-index')
 def parse_index():
    """Download the MBFC filtered-search index and save it as a CSV.

    Pages 1 through 53 of the filtered search are fetched; every row of the
    ``mbfc-table`` table becomes one record. The records are written,
    pipe-separated, to ``mbfc_bias.csv`` in the data directory. Progress is
    reported on stderr.
    """
    html_parser = etree.HTMLParser()
    records = []
    for page_number in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page_number}"
        print(f"downloading {url}", file=sys.stderr)
        body = requests.get(url).content
        document = etree.parse(BytesIO(body), html_parser)
        table_rows = document.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(table_rows)} rows", file=sys.stderr)
        for table_row in table_rows:
            # unpacking fails loudly unless the row has exactly these eight cells
            (link, bias, reporting, country, credibility,
             media_type, traffic, popularity) = tuple(table_row.iterchildren())
            anchor = link.xpath('./a')[0]
            records.append({
                'name': anchor.text,
                'detail_url': anchor.get('href'),
                'bias': bias.text,
                'reporting': reporting.text,
                'country': country.text,
                'credibility': credibility.text,
                'media_type': media_type.text,
                'traffic': traffic.text,
                'popularity': popularity.xpath('./span')[0].text,
            })
    frame = pd.DataFrame(records)
    save_to = paths('data') / 'mbfc_bias.csv'
    frame.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(frame)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
 def schema():
    """Create the ``mbfc`` schema and (re)create the ``mbfc.scrape`` table.

    ``mbfc.scrape`` holds one row per downloaded url together with the time
    it was scraped (defaulting to ``now()``).
    """
    with connect() as db:
        # IF NOT EXISTS keeps the command re-runnable, matching the
        # "create or replace" used for the table below
        db.sql("""create schema if not exists mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
            )
        """)
@click.command("mbfc:scrape")
 def scrape():
    """Download the MBFC detail page of every publisher not yet scraped.

    The urls come from ``mbfc_bias.csv``; urls already present in
    ``mbfc.scrape`` are skipped. Each page is saved as
    ``<data>/mbfc/<last url segment>.html`` and then recorded in
    ``mbfc.scrape``. A failed download is reported on stderr and skipped, so
    it is retried on the next run.
    """
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")

    with connect() as db:
        stats  = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
            on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    # make sure the target directory exists before the first page is written
    out_dir = paths('data') / 'mbfc'
    out_dir.mkdir(parents=True, exist_ok=True)

    for url in df.url:
        # random 1-3 second pause between requests
        delay = randint(1,3)
        save_as = out_dir / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url)
            # an HTTP error page must not be stored and marked as scraped
            response.raise_for_status()
        except Exception as e:
            print(f"request failed: {url} ({e})", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        # record the url only after the page is on disk
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)
 def load():
    """Extract each publisher's source link from the scraped MBFC pages.

    For every file in ``<data>/mbfc`` the first text node matching
    ``source:`` (case-insensitive) whose enclosing ``<p>`` contains a link
    supplies the ``tld`` value (the link's ``href``). Pages without such a
    link are reported on stderr and get no ``tld`` value. The result is
    written, pipe-separated, to ``mbfc_publisher_url.csv``.
    """
    source_marker = re.compile(r'source:', re.IGNORECASE)
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')

        for e in tree(string=source_marker):
            # climb to the enclosing <p>; stop if the document root is passed
            e = e.parent
            while e is not None and e.name != 'p':
                e = e.parent
            if e is None:
                continue
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
        else:
            # no match yielded a link; report it instead of dropping into a debugger
            print(f"no source link found: {page}", file=sys.stderr)
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
@click.command('mbfc:create-tables')
 def create_tables():
    """Build ``mbfc.publishers`` and ``mbfc.publisher_stories``.

    Publisher ratings (``mbfc_publishers.csv``) are merged with the scraped
    publisher urls (``mbfc_publisher_url.csv``), reduced to one row per tld
    (taking the mode of every attribute) and stored as ``mbfc.publishers``.
    Stories are then linked to those publishers through the tld derived
    from each story url.
    """
    # bias_label_to_int is not among this module's top-level imports
    from data.main import bias_label_to_int

    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    # NOTE(review): both files must provide an 'mbfc_url' column for this
    # merge — confirm against whatever produces them
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    # the tld column must live on raw_stories, the frame the query below reads
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
               s.id as story_id
               ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
            ON p.tld = s.tld
        """)
--- a/src/data/links.py
+++ b/src/data/links.py
@ -0,0 +1,135 @@
 import click
 from data.main import connect
 import pandas as pd
@click.command('links:create-table')
 def create_table():
    """(Re)create ``link_edges``: publisher-to-publisher link counts.

    One row per (parent publisher, child publisher) pair, where a link is a
    related story of the child attached to a story of the parent. Only
    publishers that occur both as a parent and as a child are kept. Besides
    the raw count the table holds ``normalized`` (the count divided by the
    parent's total) and ``onehot`` (1 when any link exists).
    """
    with connect() as db:
        db.query("""
            CREATE OR REPLACE TABLE link_edges AS
            with cte as(
                SELECT 
                    s.publisher_id as parent_id
                    ,r.publisher_id as child_id
                    ,count(1) as links
                FROM stories s
                JOIN related_stories r
                ON s.id = r.parent_id
                group by 
                    s.publisher_id
                    ,r.publisher_id
            )
            SELECT
                cte.parent_id
                ,cte.child_id
                ,cte.links as links
                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
                ,case when cte.links > 0 then 1 else 0 end as onehot
            FROM cte
            WHERE cte.child_id in (
                SELECT
                    distinct parent_id
                FROM cte
            )
            AND cte.parent_id in (
                SELECT
                    distinct child_id
                FROM cte
            )
        """)

    print("created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def create_pca(source):
    """create 2D pca labels

    The chosen SOURCE column of link_edges is pivoted into a parent-by-child
    matrix (missing pairs become 0), reduced to two principal components
    and stored per publisher in publisher_pca_<source>.
    """
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        # DISTINCT: the join to publisher_stories yields one row per story;
        # without it every publisher is repeated in the merge below
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON p.id = ps.publisher_id
        """).df()
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    pca = PCA(n_components=2)
    components = pca.fit_transform(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['first'] = components[:, 0]
    out['second'] = components[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        # the query reads the local frame `out` by name
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)

    print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def create_clusters(source):
    """create link adj. matrix clusters table

    The chosen SOURCE column of link_edges is pivoted into a parent-by-child
    matrix (missing pairs become 0), clustered with k-means (k = 8) and the
    cluster label of each publisher is stored in publisher_clusters_<source>.
    """
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"

    with connect() as db:
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        # DISTINCT: the join to publisher_stories yields one row per story;
        # without it every publisher is repeated in the merge below
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]

    with connect() as db:
        # the query reads the local frame `new_table` by name
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)

    print(f"created {table_name}")
--- a/src/data/main.py
+++ b/src/data/main.py
@ -2,6 +2,10 @@ import os
 from pathlib import Path
 import duckdb
 from enum import Enum
 from urllib.parse import urlparse
 from tld import get_tld
 from tld.utils import update_tld_names
 import sys
 class Data(str, Enum):
    Titles = 'titles'
@ -9,6 +13,16 @@ class Data(str, Enum):
 def data_dir():
    """Return the directory named by the DATA_MINING_DATA_DIR variable.

    Raises ``KeyError`` when the variable is not set.
    """
    location = os.environ['DATA_MINING_DATA_DIR']
    return Path(location)
 def paths(name='app'):
    """Return one of the project directories, selected by ``name``.

    ``name`` is matched by substring, in this order: 'app', 'data', 'doc',
    'figure'. The directories come from the DATA_MINING_APP_DIR,
    DATA_MINING_DATA_DIR and DATA_MINING_DOCS_DIR environment variables;
    'figure' resolves to the ``figures`` folder inside the docs directory.

    Raises:
        KeyError: the required environment variable is not set.
        ValueError: ``name`` matches none of the known directories.
    """
    if 'app' in name:
        return Path(os.environ['DATA_MINING_APP_DIR'])
    if 'data' in name:
        return Path(os.environ['DATA_MINING_DATA_DIR'])
    if 'doc' in name:
        return Path(os.environ['DATA_MINING_DOCS_DIR'])
    if 'figure' in name:
        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
    # falling through would hand callers None, which only fails later at `None / ...`
    raise ValueError(f"unknown path name: {name!r}")
 def connect():
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@ -28,3 +42,66 @@ def from_db(t: Data):
            limit 100
        """).df()
    return table
 def map_tld(x):
    """Return the ``fld`` of url ``x`` as parsed by ``get_tld``, or None.

    When ``get_tld`` cannot parse ``x`` the value is reported on stderr and
    None is returned, so the function can be applied to a whole column.
    """
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    # not a bare except: KeyboardInterrupt / SystemExit must still propagate
    except Exception:
        print(f"'{x}' is not valid.", file=sys.stderr)
        return None
 def ticklabels():
    """Return the five bias labels, ordered from 'Left' to 'Right'.

    A new list is built on every call.
    """
    ordered = ('Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right')
    return list(ordered)
 def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
    """Translate a bias label into its class id (0 = left ... 4 = right).

    Args:
        rating: the label to translate.
        source: 'mbfc' selects the capitalised labels ('Left' ...
            'Least Biased' ... 'Right'); any other value selects the
            lower-case labels ('left' ... 'center' ... 'right').

    Returns:
        The class id, or -1 (after a message on stderr) when ``rating`` has
        no mapping.
    """
    if source == 'mbfc':
        mapping = {
            'Left' : 0,
            'Left-Center' : 1,
            'Least Biased' : 2,
            'Right-Center' : 3,
            'Right' : 4,
        }
    else:
        mapping = {
            'left' : 0,
            'left-center' : 1,
            'center' : 2,
            'right-center' : 3,
            'right' : 4,
        }
    try:
        return mapping[rating]
    # KeyError: unknown label; TypeError: unhashable value. Anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (KeyError, TypeError):
        print(f"no mapping for {rating}", file=sys.stderr)
        return -1
 def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    """Translate a class id (0 = left ... 4 = right) into its bias label.

    Args:
        class_id: the id to translate.
        source: 'mbfc' selects the capitalised labels ('Left' ...
            'Least Biased' ... 'Right'); any other value selects the
            lower-case labels ('left' ... 'center' ... 'right').

    Returns:
        The label. When ``class_id`` has no mapping a message is written to
        stderr and the integer -1 is returned (not a string, despite the
        annotation); this is kept for existing callers.
    """
    if source == 'mbfc':
        mapping = {
            0 : 'Left',
            1 : 'Left-Center',
            2 : 'Least Biased',
            3 : 'Right-Center',
            4 : 'Right',
        }
    else:
        mapping = {
                0 : 'left',
                1 : 'left-center',
                2 : 'center',
                3 : 'right-center',
                4 : 'right',
        }
    try:
        return mapping[class_id]
    # KeyError: unknown id; TypeError: unhashable value. Anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (KeyError, TypeError):
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1
--- a/src/data/scrape.py
+++ b/src/data/scrape.py
@ -319,12 +319,6 @@ def another_norm():
    """)
    def map_tld(x):
        try:
            res = get_tld(x, as_object=True)
            return res.fld
        except:
            return None
    DB.sql("""
        SELECT
--- a/src/data/selection.py
+++ b/src/data/selection.py
@ -0,0 +1,47 @@
 from data.main import connect
 import pandas as pd
 import numpy as np
 def create_tables():
    """Build the ``top`` schema: publishers, stories and related stories.

    ``top.publishers`` holds the publishers that appear as a parent in
    ``link_edges``; ``top.stories`` holds their stories published from 2006
    up to and including 2022; ``top.related_stories`` holds the related
    stories of those stories.
    """
    with connect() as db:
        edges = db.query("""
            select
            *
            from link_edges
        """).df()

    # the pivot index is the set of distinct parent publishers
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        # IF NOT EXISTS keeps the function re-runnable, matching the
        # CREATE OR REPLACE used for the tables below
        db.query("create schema if not exists top")

        # reads the local frame `select_publishers` by name
        db.query("""
            CREATE OR REPLACE TABLE top.publishers AS
            SELECT
                p.*
            FROM publishers p
            JOIN select_publishers s
            ON s.publisher_id = p.id
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.stories AS
            SELECT
                s.*
            FROM stories s
            JOIN top.publishers p
            ON s.publisher_id = p.id
            WHERE year(s.published_at) >= 2006
            AND year(s.published_at) < 2023
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.related_stories AS
            SELECT
                r.*
            FROM top.stories s
            JOIN related_stories r
            ON s.id = r.parent_id
        """)
--- a/src/data/sentiment.py
+++ b/src/data/sentiment.py
@ -1,10 +1,11 @@
 import click
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
 import torch.nn.functional as F
-from data import connect, data_dir
+from data.main import connect, paths
 import numpy as np
 from tqdm import tqdm
-import click
+import pandas as pd
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
@ -67,20 +68,19 @@ def extract(chunks):
@click.command('sentiment:load')
 def load():
-    DB = connect()
+    sentiments = np.load(paths('data') / 'sentiment.npy')
-    sentiments = np.load(data_dir() / 'sentiment.npy')
+    story_ids = np.load(paths('data') / 'sentiment_ids.npy')
    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
    data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
    data['sentiment_id'] = sentiments
-    DB.query("""
+    with connect() as db:
-        CREATE OR REPLACE TABLE top.story_sentiments AS
+        db.query("""
-        SELECT
+            CREATE OR REPLACE TABLE story_sentiments AS
-            data.story_id
+            SELECT
-            ,data.sentiment_id as class_id
+                data.story_id
-            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
+                ,data.sentiment_id as class_id
-        FROM data
+                ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
-        JOIN top.stories s
+            FROM data
-        ON s.id = data.story_id
+            JOIN stories s
-    """)
+            ON s.id = data.story_id
-    DB.close()
+        """)
--- a/src/links.py
+++ b/src/links.py
@ -1,255 +0,0 @@
 import click
 from data.main import connect
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
@click.command('links:create-table')
 def create_table():
    """(Re)create ``top.link_edges``: publisher-to-publisher link counts.

    One row per (parent publisher, child publisher) pair taken from
    ``top.stories`` and ``top.related_stories``. Only publishers that occur
    both as a parent and as a child are kept. Besides the raw count the
    table holds ``normalized`` (the count divided by the parent's total)
    and ``onehot`` (1 when any link exists).
    """
    table_name = "top.link_edges"
    # the context manager closes the connection even when the statement fails
    with connect() as DB:
        DB.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            with cte as(
                SELECT 
                    s.publisher_id as parent_id
                    ,r.publisher_id as child_id
                    ,count(1) as links
                FROM top.stories s
                JOIN top.related_stories r
                ON s.id = r.parent_id
                group by 
                    s.publisher_id
                    ,r.publisher_id
            )
            SELECT
                cte.parent_id
                ,cte.child_id
                ,cte.links as links
                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
                ,case when cte.links > 0 then 1 else 0 end as onehot
            FROM cte
            WHERE cte.child_id in (
                SELECT
                    distinct parent_id
                FROM cte
            )
            AND cte.parent_id in (
                SELECT
                    distinct child_id
                FROM cte
            )
        """)

    print(f"created {table_name}")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def create_pca(source):
    """create 2D pca labels

    The chosen SOURCE column of top.link_edges is pivoted into a
    parent-by-child matrix (missing pairs become 0), reduced to two
    principal components and stored per publisher in
    top.publisher_pca_<source>.
    """
    from sklearn.decomposition import PCA

    table_name = f"top.publisher_pca_{source}"
    # the connection is held only while reading and is closed even on error
    with connect() as DB:
        pub = DB.query("""
            SELECT
                *
            FROM top.publishers
        """).df()
        df = DB.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM top.link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    pca = PCA(n_components=2)
    components = pca.fit_transform(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['first'] = components[:, 0]
    out['second'] = components[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as DB:
        # the query reads the local frame `out` by name
        DB.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)

    print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """Create k-means cluster labels for publishers from the link-edge table.

    Reads the ``{source}`` column of ``top.link_edges``, pivots it into a
    parent-by-child matrix (absent pairs become 0), clusters the rows into
    8 groups with KMeans and writes (``publisher_id``, ``label``) rows to
    ``top.publisher_clusters_{source}``.

    Args:
        source: name of the ``top.link_edges`` column used as the edge
            weight; click restricts it to ``links``, ``normalized`` or
            ``onehot``.
    """
    from sklearn.cluster import KMeans

    table_name = f"top.publisher_clusters_{source}"

    edges_sql = f"""
        SELECT
            parent_id
            ,child_id
            ,{source} as links
        FROM top.link_edges
    """
    publishers_sql = """
        SELECT
            *
        FROM top.publishers
    """
    create_sql = f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            n.id as publisher_id
            ,n.label as label
        FROM new_table n
    """

    # The context manager releases the connection even when a query raises.
    with connect() as db:
        df = db.query(edges_sql).df()
        pub = db.query(publishers_sql).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    # Copy the slice so adding a column does not write into a view of `pivot`.
    out = pivot.reset_index()[['parent_id']].copy()
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    # The CREATE statement refers to this frame by the name `new_table`
    # (presumably resolved through DuckDB's replacement scan), so the
    # variable must keep that name and be in scope for this call.
    new_table = out[['id', 'label']]
    with connect() as db:
        db.query(create_sql)
    print(f"created {table_name}")
 def to_matrix():
    """Return id, name and url of every publisher that is a row of the link matrix.

    Queries per-publisher link counts (publishers with at least 20 stories,
    left-joined to the links from their stories to related stories), pivots
    them into a parent-by-child matrix and returns one row per matrix row
    with the publisher's ``id``, ``name`` and ``url``.

    NOTE(review): this function was documented as returning an adjacency
    matrix, and `analysis` appears to expect a numeric matrix, but the
    return statement yields the id/name/url lookup frame. The return value
    is left as coded; confirm which one callers need.

    Returns:
        pandas.DataFrame with columns ``id``, ``name``, ``url``.
    """
    publishers_sql = """
            select 
                p.id
                ,p.name
                ,p.url
            from publishers p
    """
    edges_sql = """
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY 
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
            ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT 
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
            ON s.id = r.parent_id
            group by 
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
        ON p.id = cte.parent_id
    """

    # Close the connection as soon as both frames are materialised; the
    # original never closed it.
    with connect() as db:
        pub = db.sql(publishers_sql).df()
        edges = db.sql(edges_sql).df()

    # The matrix is built before anything reads it: the original merged on
    # `adj` one line before assigning it, which raised UnboundLocalError.
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    out = pd.DataFrame(adj.index.values, columns=['id'])
    return pd.merge(out, pub, how='left', on='id')
@click.command('links:analysis')
 def analysis():
    """Exploratory plots of the publisher link data.

    Shows three scatter plots one after another with ``plt.show()``: the
    first two PCA components, the same points coloured by MiniBatchKMeans
    cluster, and ``pub['first']`` vs ``pub['second']`` coloured by
    ``pub['bias']``. Nothing is saved or returned.
    """
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.cluster import MiniBatchKMeans
    # NOTE(review): to_matrix() as written returns id/name/url columns rather
    # than link counts — confirm this is a valid input for PCA/SVD.
    adj = to_matrix()
    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)
    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)
    # The SVD coordinates are overwritten by the PCA ones two lines below,
    # so only the PCA projection is ever plotted.
    x = svd_out[:, 0]
    y = svd_out[:, 1]
    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()
    # Cluster on all four PCA components, then colour the 2D scatter by cluster.
    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)
    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()
    # NOTE(review): `pub` is not defined inside this function; presumably a
    # module-level frame with first/second/bias columns — verify it exists.
    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
    plt.show()
--- a/src/mine.py
+++ b/src/mine.py
@ -1,6 +1,5 @@
-from data.main import data_dir, connect
+from data.main import connect, paths
 import numpy as np
 import sklearn
 from sklearn.cluster import MiniBatchKMeans
 import click
 from pathlib import Path
@ -11,7 +10,7 @@ from enum import Enum, auto
@click.command(name="mine:embeddings")
 def embeddings():
-    data = np.load(data_dir() / "embeddings.npy")
+    data = np.load(paths('data') / "embeddings.npy")
    kmeans = MiniBatchKMeans(n_clusters=5,
                             random_state=0,
                             batch_size=6,
@ -76,7 +75,7 @@ class PlotName(str, Enum):
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
 def plot(name: PlotName, output: Path):
-    output = output if output else APP_DIR / f'docs/{name}.png'
+    output = output if output else paths('figures') / f'{name}.png'
    if name == PlotName.TitleLength:
        fig, ax = plt.subplots(1,1)
        data = db.sql("""
--- a/src/mining/bias.py
+++ b/src/mining/bias.py
@ -0,0 +1,36 @@
 from data.main import connect, map_tld
 import os
 from pathlib import Path
 def normalize():
    """Issue two exploratory aggregate queries over the MBFC publisher tables.

    The first query counts stories per publisher name; the second counts
    publishers, stories and their ratio per bias. Each query gets its own
    connection and the query results are not kept or returned.
    """
    stories_per_publisher_sql = """
            SELECT
                p.name
                ,count(1) as ctn
                ,sum(ctn) over() as all
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
            JOIN stories s
            ON s.id = ps.story_id
            GROUP BY
                p.name
        """
    stories_per_bias_sql = """
            SELECT
                bias
                ,count(distinct p.id) as publishers
                ,count(1) as stories
                ,count(1) / count(distinct p.id) as ratio
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
            JOIN stories s
            ON s.id = ps.story_id
            GROUP BY
                p.bias
            ORDER BY count(1)
        """
    # One fresh connection per statement, in the same order as before.
    for statement in (stories_per_publisher_sql, stories_per_bias_sql):
        with connect() as db:
            db.sql(statement)
--- a/src/plots/init.py
+++ b/src/plots/init.py
@ -1,9 +1,13 @@
 import plots.sentence
 import plots.emotion
 import plots.sentiment
 import plots.links
 import plots.classifier
 # Names exported by `from plots import *`. Every entry needs its own
 # trailing comma: adjacent string literals are silently concatenated.
 __all__ = [
    'sentence',
    'emotion',
    'sentiment',
    'links',
    'classifier',
 ]
--- a/src/plots/bias.py
+++ b/src/plots/bias.py
@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, bias_label_to_int, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
 def hist():
-    filename = "bias_hist.png"
+    save_to = paths('figures') / "bias_hist.png"
    with connect() as db:
        data = db.sql("""
            SELECT
                p.ordinal
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
            ON s.id = ps.story_id
            JOIN mbfc.publishers p
            ON ps.publisher_id = p.id
            WHERE ordinal != -1
            GROUP BY
                p.ordinal
        """).df()
    DB = connect()
    data = DB.sql("""
        SELECT
            b.ordinal
            ,count(1) as stories
        FROM stories s
        JOIN publisher_bias pb
        ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            b.ordinal
    """).df()
    DB.close()
    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:bias-publisher-hist')
 def publisher_hist():
-    filename = "bias_publisher_hist.png"
+    save_to = paths('figures') / "bias_publisher_hist.png"
-    DB = connect()
+    with connect() as db:
-    data = DB.sql("""
+        data = db.sql("""
-        SELECT
+            SELECT
-            b.ordinal
+                p.ordinal
-            ,count(1) as publishers
+                ,count(distinct p.id) as publishers
-        FROM publisher_bias pb
+            FROM mbfc.publishers p
-        JOIN bias_ratings b
+            JOIN mbfc.publisher_stories ps
-        ON b.id = pb.bias_id
+            ON ps.publisher_id = p.id
-        GROUP BY
+            WHERE ordinal != -1
-            b.ordinal
+            GROUP BY
-    """).df()
+                p.ordinal
-    DB.close()
+        """).df()
    ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
    plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
--- a/src/plots/classifier.py
+++ b/src/plots/classifier.py
@ -5,30 +5,32 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:pca-with-classes')
-def pca_with_classes():
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
-    filename = "pca_with_classes.png"
+def pca_with_classes(source):
-    DB = connect()
+    save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
-    data = DB.query(f"""
+
-        SELECT
+    with connect() as db:
-            p.tld
+        df = db.query(f"""
-            ,b.bias
+            SELECT
-            ,c.first
+                p.tld
-            ,c.second
+                ,p.bias
-            ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+                ,c.first
-        FROM top.publishers p
+                ,c.second
-        JOIN top.publisher_bias pb
+                --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
-        ON p.id = pb.publisher_id
+            FROM mbfc.publishers p
-        JOIN bias_ratings b
+            JOIN publisher_pca_{source} c
-        ON b.id = pb.bias_id
+            ON c.publisher_id = p.id
-        JOIN top.publisher_pca_normalized c
+            WHERE p.ordinal != -1
-        ON c.publisher_id = p.id
+            ORDER BY p.ordinal
-    """).df()
+        """).df()
-    DB.close()
+
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
+    ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
-    ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
+    ax.set(xlabel="first pca component", 
-    plt.savefig(out_dir / filename)
+           ylabel="second pca component")
-    print(f"saved: {filename}")
+    ax.figure.suptitle="pca components vs. bias labels"
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
    os.system(f'xdg-open {save_to}')
--- a/src/plots/descriptive.py
+++ b/src/plots/descriptive.py
@ -1,169 +1,190 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 import numpy as np
 out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:articles-per-year')
 def articles_per_year():
-    filename = 'articles_per_year.png'
+    save_to = paths('figures') / 'articles_per_year.png'
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = DB.query("""
-        select
+            select
-            year(published_at) as year
+                year(published_at) as year
-            ,count(1) as stories
+                ,count(1) as stories
-        from stories
+            from stories
-        group by 
+            group by 
-            year(published_at)
+                year(published_at)
-    """).df()
+        """).df()
    DB.close()
    ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
    ax.tick_params(axis='x', rotation=90)
    ax.set(title="count of articles per year", ylabel="count of stories (#)")
    plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    print(f"saved: {save_to}")
@click.command('plot:distinct-publishers')
 def distinct_publishers():
-    filename = 'distinct_publishers.png'
+    save_to = paths('figures') / 'distinct_publishers.png'
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = DB.query("""
-        select
+            select
-            year(published_at) as year
+                year(published_at) as year
-            ,count(distinct publisher_id) as publishers
+                ,count(distinct publisher_id) as publishers
-        from stories
+            from stories
-        group by 
+            group by 
-            year(published_at)
+                year(published_at)
-    """).df()
+        """).df()
    DB.close()
    ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
    ax.tick_params(axis='x', rotation=90)
    ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
    plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher')
 def stories_per_publisher():
-    filename = 'stories_per_publisher.png'
+    save_to = paths('figures') / 'stories_per_publisher.png'
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = db.query("""
-        with cte as (
+            with cte as (
        select
            publisher_id
            ,year(published_at) as year
            ,count(1) as stories
        from stories
        group by 
            publisher_id
            ,year(published_at)
        ) , agg as (
            select
-                publisher_id
+                ps.publisher_id
-                ,avg(stories) as stories_per_year
+                ,year(s.published_at) as year
-                ,case 
+                ,count(1) as stories
-                    when avg(stories) < 2 then 2
+            from stories s
-                    when avg(stories) < 4 then 4
+            join mbfc.publisher_stories ps
-                    when avg(stories) < 8 then 8
+            on ps.story_id = s.id
                    when avg(stories) < 16 then 16
                    when avg(stories) < 32 then 32
                    when avg(stories) < 64 then 64
                    when avg(stories) < 128 then 128
                    else 129
                end as max_avg
            from cte
            group by 
-                publisher_id
+                ps.publisher_id
-        )
+                ,year(s.published_at)
-        select
+            ) , agg as (
-            max_avg
+                select
-            ,count(1) as publishers
+                    publisher_id
-        from agg
+                    ,avg(stories) as stories_per_year
-        group by
+                    ,case 
-            max_avg
+                        when avg(stories) < 2 then 2
-    """).df()
+                        when avg(stories) < 4 then 4
-    DB.close()
+                        when avg(stories) < 8 then 8
                        when avg(stories) < 16 then 16
                        when avg(stories) < 32 then 32
                        when avg(stories) < 64 then 64
                        when avg(stories) < 128 then 128
                        else 129
                    end as max_avg
                from cte
                group by 
                    publisher_id
            )
            select
                max_avg
                ,count(1) as publishers
            from agg
            group by
                max_avg
        """).df()
    ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
-    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
+    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
    plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:top-publishers')
 def top_publishers():
    """plot top publishers over time"""
-    filename = 'top_publishers.png'
+    save_to = paths('figures') / 'top_publishers.png'
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        db.query("""
-        select
+                SELECT
-            p.tld
+                    p.tld
-            ,year(published_at) as year
+                    ,p.id
-            ,count(1) as stories
+                FROM mbfc.publishers p
-        from (
+                JOIN mbfc.publisher_stories ps
-            select
+                ON ps.publisher_id = p.id
                JOIN stories s
                ON s.id = ps.story_id
                GROUP BY
                    p.tld
                    ,p.id
                order by count(1) desc
                limit 20
            """)
    with connect() as db:
        data = db.query("""
            WITH p as ( 
                SELECT
                    p.tld
                    ,p.id
                FROM mbfc.publishers p
                JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
                JOIN stories s
                ON s.id = ps.story_id
                GROUP BY
                    p.tld
                    ,p.id
                order by count(1) desc
                limit 20
            ) 
            SELECT
                p.tld
-                ,p.id
+                ,YEAR(s.published_at) AS year
-            from top.publishers p
+                ,COUNT(1) AS stories
-            join top.stories s
+            FROM stories s
-            on s.publisher_id = p.id
+            JOIN mbfc.publisher_stories ps
-            group by
+            ON ps.story_id = s.id
            JOIN p
            ON p.id = ps.publisher_id
            GROUP by 
                p.tld
-                ,p.id
+                ,YEAR(published_at)
-            order by count(1) desc
+            ORDER BY year, COUNT(DISTINCT s.id) DESC
-            limit 20
+        """).df()
        ) p
        join top.stories s
        on s.publisher_id = p.id
        group by 
            p.tld
            ,year(published_at)
        order by count(distinct s.id) desc
    """).df()
    DB.close()
    pivot = data.pivot(columns='year', index='tld', values='stories')
    ax = sns.heatmap(pivot, cmap="crest")
    ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
    plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:common_tld')
 def common_tld():
    import dataframe_image as dfi
-    filename = 'common_tld.png'
+    save_to = paths('figures') / 'common_tld.png'
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = db.query("""
-        select
+            select
-            split_part(url, '.', -1) as tld
+                split_part(url, '.', -1) as tld
-            ,count(1) as publishers
+                ,count(1) as publishers
-            ,case when count(1) < 20
+                ,case when count(1) < 20
-                then string_agg(distinct url, '\t')
+                    then string_agg(distinct url, '\t')
-                else NULL
+                    else NULL
-                end as urls
+                    end as urls
-        from publishers
+            from publishers
-        group by
+            group by
-            split_part(url, '.', -1)
+                split_part(url, '.', -1)
-        order by 
+            order by 
-            count(1) desc
+                count(1) desc
-    """).df()
+        """).df()
-    DB.close()
+    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
 def stats():
@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats')
 def bias_stats():
    import dataframe_image as dfi
-    filename = 'bias_stats.png'
+    save_to = paths('figures') / 'bias_stats.png'
    DB = connect()
@ -300,3 +321,69 @@ def bias_stats():
    """).df()
    DB.close()
    print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
def bias_over_time():
    """Plot yearly story counts per publisher, one panel per bias label.

    Counts stories per (bias, publisher, year) for publishers whose ordinal
    is not -1, skipping the years 2006 and 2023, draws one line per
    publisher id and saves the figure as ``bias_over_time.png`` in the
    figures directory.
    """
    save_to = paths('figures') / 'bias_over_time.png'
    yearly_sql = """
            SELECT
                p.bias
                ,p.id
                ,date_trunc('year', s.published_at) as year
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
            ON ps.story_id = s.id
            JOIN mbfc.publishers p
            ON p.id = ps.publisher_id
            where year(s.published_at) not in (2006, 2023)
            and p.ordinal != -1
            GROUP BY
                p.bias
                ,p.id
                ,p.ordinal
                ,date_trunc('year', s.published_at)
            order by 
                p.ordinal
                ,date_trunc('year', s.published_at)
        """

    with connect() as db:
        yearly = db.sql(yearly_sql).df()

    # estimator=None together with units='id' draws each publisher as its
    # own line instead of aggregating them.
    grid = sns.relplot(
        yearly,
        kind='line',
        x='year',
        y='stories',
        col='bias',
        units='id',
        estimator=None,
        palette='rainbow',
    )
    grid.set(ylabel="stories", xlabel="year")

    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
 def bias_missing():
    """Show yearly story counts for stories that have no MBFC publisher match.

    Counts stories per (tld, year) where the left join to
    ``mbfc.publisher_stories`` finds no publisher, skipping the years 2006
    and 2023 and groups with 10 or fewer stories, then displays one line
    per tld with ``plt.show()``. The figure is only shown, not saved.
    """
    with connect() as db:
        df = db.sql("""
            SELECT
                date_trunc('year', s.published_at) as year
                ,s.tld
                ,count(1) as stories
            FROM stories s
            LEFT JOIN mbfc.publisher_stories ps
            ON ps.story_id = s.id
            WHERE ps.publisher_id is NULL
            AND year(s.published_at) not in (2006, 2023)
            GROUP BY
                s.tld
                ,date_trunc('year', s.published_at)
            HAVING count(1) > 10
            ORDER BY 
                date_trunc('year', s.published_at)
        """).df()

    # Plot after the connection is released so the database is not held
    # open while the interactive window blocks in plt.show().
    ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
    ax.set(ylabel="stories", xlabel="year")
    plt.tight_layout()
    plt.show()
    plt.close()
--- a/src/plots/emotion.py
+++ b/src/plots/emotion.py
@ -1,77 +1,79 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
 def emotion_over_time():
    filename = "emotion_over_time.png"
    DB = connect()
-    emotions = DB.sql("""
+    filename = "emotion_over_time.png"
-        SELECT
+    save_to = paths('figures') / filename
-            date_trunc('year', s.published_at) AS year
+
-            ,e.label AS emotion
+    with connect() as db:
-            ,count(1) AS stories
+        emotions = db.sql("""
-        FROM top.stories s
+            SELECT
-        JOIN story_emotions se
+                date_trunc('year', s.published_at) AS year
-        ON s.id = se.story_id
+                ,e.label AS emotion
-        JOIN emotions e
+                ,count(1) AS stories
-        ON e.id = se.emotion_id
+            FROM stories s
-        GROUP by
+            JOIN story_emotions se
-            date_trunc('year', s.published_at)
+            ON s.id = se.story_id
-            ,e.label
+            JOIN emotions e
-    """).df()
+            ON e.id = se.emotion_id
-    DB.close()
+            GROUP by
                date_trunc('year', s.published_at)
                ,e.label
        """).df()
    ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
    os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression')
 def emotion_regression():
    """plot emotion over time as regression"""
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import ConfusionMatrixDisplay
    filename = "emotion_regression.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
-    emotions = DB.query("""
+        #emotions = db.query("""
-        SELECT
+        #    SELECT
-        label
+        #        label
-        FROM emotions e
+        #    FROM emotions e
-    """).df()['label'].to_list()
+        #""").df()['label'].to_list()
-    DB.close()
+        df = db.sql(f"""
-
+            SELECT
-    DB = connect()
+                epoch(date_trunc('yearweek', s.published_at)) AS date
-    df = DB.sql(f"""
+                ,e.id AS emotion_id
-        SELECT
+                ,p.id as publisher_id
-            epoch(date_trunc('yearweek', s.published_at)) AS date
+                ,count(1) AS stories
-            ,e.id AS emotion_id
+            FROM stories s
-            ,p.id as publisher_id
+            JOIN mbfc.publisher_stories ps
-            ,count(1) AS stories
+            ON ps.story_id = s.id
-        FROM top.stories s
+            JOIN mbfc.publishers p
-        JOIN top.publishers p
+            ON p.id = ps.publisher_id
-        ON p.id = s.publisher_id
+            JOIN story_emotions se
-        JOIN story_emotions se
+            ON s.id = se.story_id
-        ON s.id = se.story_id
+            JOIN emotions e
-        JOIN emotions e
+            ON e.id = se.emotion_id
-        ON e.id = se.emotion_id
+            WHERE p.ordinal != -1
-        GROUP by
+            GROUP by
-            epoch(date_trunc('yearweek', s.published_at))
+                epoch(date_trunc('yearweek', s.published_at))
-            ,p.id
+                ,p.id
-            ,e.id
+                ,e.id
-    """).df()
+        """).df()
    DB.close()
    results = []
    for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@ -83,77 +85,59 @@ def emotion_regression():
        results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
    results = pd.DataFrame(results)
-    DB = connect()
+    with connect() as db:
-    out = DB.query("""
+        out = db.query("""
-        SELECT
+            SELECT
-            e.label as emotion
+                e.label as emotion
-            --,p.tld
+                ,avg(results.per_year) as avg_reg_coef
-            ,avg(results.per_year) as avg_reg_coef
+                ,p.bias
-            ,b.ordinal
+            FROM results
-        FROM results
+            JOIN emotions e
-        JOIN emotions e
+            ON e.id = results.emotion_id
-        ON e.id = results.emotion_id
+            JOIN mbfc.publishers p
-        JOIN top.publishers p
+            ON p.id = results.publisher_id
-        ON p.id = results.publisher_id
+            GROUP BY
-        JOIN publisher_bias pb
+                e.label
-        ON pb.publisher_id = results.publisher_id
+                ,p.bias
-        JOIN bias_ratings b
+        """).df()
        ON b.id = pb.bias_id
        GROUP BY
            e.label
            ,b.ordinal
    """).df()
    DB.close()
    pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
-    ax = sns.heatmap(pivot, cmap='RdBu_r')
+    pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+
    ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
    #ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
    ax.set(title="slope of regression (stories/year) by bias and emotion"
-           ,xticklabels=ticklabels
+           ,xticklabels=ticklabels()
           ,xlabel="bias"
           ,ylabel="emotion")
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:emotion-hist')
 def emotion_hist():
    filename = "emotion_hist.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
-    DB.query("""describe story_emotions""")
+        data = db.sql("""
            SELECT
                p.bias
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
            ON ps.story_id = s.id
            JOIN mbfc.publishers p
            ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
        """).df()
-    DB.query("""
+    ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
-        select
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
            e.label
            ,count(distinct s.id) as stories
            ,count(distinct s.publisher_id) as publishers
        from story_emotions se
        join emotions e
        on e.id = se.emotion_id
        join top.stories s
        on s.id = se.story_id
        group by
            e.label
    """).df().to_markdown(index=False)
    data = DB.sql("""
        SELECT
            b.ordinal
            ,count(1) as stories
        FROM stories s
        JOIN publisher_bias pb
        ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            b.ordinal
    """).df()
    DB.close()
    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
--- a/src/plots/links.py
+++ b/src/plots/links.py
@ -9,20 +9,20 @@ import numpy as np
 from sklearn.metrics import silhouette_score
 import pandas as pd
 out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow')
 def elbow():
    from sklearn.cluster import KMeans
-    filename = 'link_cluster_elbow.png'
+    save_to = paths('figures') / 'link_cluster_elbow.png'
    with connect() as db:
        df = db.query("""
            SELECT
                *
            FROM link_edges
        """).df()
    DB = connect()
    df = DB.query("""
        SELECT
            *
        FROM link_edges
    """).df()
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    to_plot = []
@ -36,8 +36,9 @@ def elbow():
    ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
    ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
    # randomly pick 8
@ -45,72 +46,65 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
 def link_pca_clusters(source):
-    filename = f"link_pca_clusters_{source}.png"
+    save_to = paths('figures') / f"link_pca_clusters_{source}.png"
-    DB = connect()
+    with connect() as db:
-    df = DB.query(f"""
+        df = db.query(f"""
-        SELECT
+            SELECT
-            c.label as cluster
+                c.label as cluster
-            ,p.tld
+                ,p.tld
-            --,b.label as bias
+                --,b.label as bias
-            ,pca.first
+                ,pca.first
-            ,pca.second
+                ,pca.second
-            ,s.cnt as stories
+                ,s.cnt as stories
-        FROM top.publisher_clusters_{source} c
+            FROM top.publisher_clusters_{source} c
-        JOIN top.publishers p
+            JOIN top.publishers p
-        ON c.publisher_id = p.id
+            ON c.publisher_id = p.id
-        JOIN 
+            JOIN 
-        (
+            (
-            select
+                select
-                s.publisher_id
+                    s.publisher_id
-                ,count(1) as cnt
+                    ,count(1) as cnt
-            FROM top.stories s
+                FROM top.stories s
-            GROUP BY
+                GROUP BY
-                s.publisher_id
+                    s.publisher_id
-        ) s
+            ) s
-        ON s.publisher_id = p.id
+            ON s.publisher_id = p.id
-        JOIN top.publisher_pca_{source} pca
+            JOIN top.publisher_pca_{source} pca
-        ON pca.publisher_id = p.id
+            ON pca.publisher_id = p.id
-    """).df()
+        """).df()
    DB.close()
    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
    ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
-
+    print(f"saved plot: {save_to}")
    # .df().groupby(['cluster', 'bias']).describe()
 def test():
    data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    DB.query("""
+    with connect() as db:
-        SELECT
+        db.query("""
-            p.id as publisher_id
+            SELECT
-            ,p.name
+                p.id as publisher_id
-            ,p.tld
+                ,p.name
-            ,cast(b.bias_id as int) as bias_id
+                ,p.tld
-            ,count(1) as stories
+                ,cast(b.bias_id as int) as bias_id
-        FROM publishers p
+                ,count(1) as stories
-        JOIN stories s
+            FROM publishers p
-        ON s.publisher_id = p.id
+            JOIN stories s
-        JOIN publisher_clusters c
+            ON s.publisher_id = p.id
-        ON c.publisher_id = p.id
+            JOIN publisher_clusters c
-        LEFT JOIN publisher_bias b
+            ON c.publisher_id = p.id
-        ON b.publisher_id = p.id
+            LEFT JOIN publisher_bias b
-        where bias_id is null
+            ON b.publisher_id = p.id
-        group by
+            where bias_id is null
-            p.id
+            group by
-            ,p.name
+                p.id
-            ,p.tld
+                ,p.name
-            ,b.bias_id
+                ,p.tld
-        ORDER BY count(1) desc
+                ,b.bias_id
-    """)
+            ORDER BY count(1) desc
-
+        """)
    # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
    DB.close()
@click.command('plot:link-confusion')
@ -120,34 +114,36 @@ def link_confusion():
    from sklearn.metrics import ConfusionMatrixDisplay
    filename = "link_confusion.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
-    bias = DB.query("""
+        bias = db.query("""
-        SELECT
+            SELECT
-            p.id as publisher_id
+                p.id as publisher_id
-            ,b.ordinal
+                ,b.ordinal
-        FROM top.publishers p
+            FROM top.publishers p
-        JOIN top.publisher_bias pb
+            JOIN top.publisher_bias pb
-        ON pb.publisher_id = p.id
+            ON pb.publisher_id = p.id
-        JOIN bias_ratings b
+            JOIN bias_ratings b
-        ON b.id = pb.bias_id
+            ON b.id = pb.bias_id
-    """).df()
+        """).df()
        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()
    df = DB.query("""
        SELECT
            *
        FROM top.link_edges
        WHERE parent_id in (
            select
                publisher_id
            from bias
        )
        AND child_id in (
            select
                publisher_id
            from bias
        )
    """).df()
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values
@ -166,9 +162,9 @@ def link_confusion():
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
@click.command('plot:link-classifier')
 def link_confusion():
@ -176,49 +172,51 @@ def link_confusion():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay
-    filename = "link_confusion.png"
+    save_to = paths('figures') / "link_confusion.png"
-    DB = connect()
+    with connect() as db:
-    bias = DB.query("""
+        bias = db.query("""
-        SELECT
+            SELECT
-            p.id as publisher_id
+                p.id as publisher_id
-            ,b.ordinal
+                ,b.ordinal
-        FROM top.publishers p
+            FROM top.publishers p
-        JOIN top.publisher_bias pb
+            JOIN top.publisher_bias pb
-        ON pb.publisher_id = p.id
+            ON pb.publisher_id = p.id
-        JOIN bias_ratings b
+            JOIN bias_ratings b
-        ON b.id = pb.bias_id
+            ON b.id = pb.bias_id
-    """).df()
+        """).df()
        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()
    df = DB.query("""
        SELECT
            *
        FROM top.link_edges
        WHERE parent_id in (
            select
                publisher_id
            from bias
        )
        AND child_id in (
            select
                publisher_id
            from bias
        )
    """).df()
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values
    y = bias.sort_values('publisher_id').ordinal
-    data = DB.query(f"""
+    with connect() as db:
-        SELECT
+        data = db.query(f"""
-            p.id as publisher_id
+            SELECT
-            ,pca.first
+                p.id as publisher_id
-            ,pca.second
+                ,pca.first
-        FROM top.publisher_pca_onehot pca
+                ,pca.second
-        JOIN top.publishers p
+            FROM top.publisher_pca_onehot pca
-        ON pca.publisher_id = p.id
+            JOIN top.publishers p
-    """).df()
+            ON pca.publisher_id = p.id
        """).df()
@ -235,11 +233,11 @@ def link_confusion():
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
    plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
+    # ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
-    plt.savefig(out_dir / filename)
+    # plt.savefig(out_dir / filename)
-    plt.close()
+    # plt.close()
-    print(f"saved plot: {filename}")
+    # print(f"saved plot: {filename}")
--- a/src/plots/sentence.py
+++ b/src/plots/sentence.py
@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 from pathlib import Path
 import seaborn as sns
@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
 data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
 def sentence_pca():
-    filename = "embedding_sentence_pca.png"
+    save_to = paths('figures') / "embedding_sentence_pca.png"
    DB = connect()
-    data = DB.query("""
+    with connect() as db:
-        SELECT
+        data = db.query("""
-            pca.first
+            SELECT
-            ,pca.second
+                pca.first
-            ,b.bias as label
+                ,pca.second
-        FROM top.story_embeddings_pca pca
+                ,b.bias as label
-        JOIN top.stories s
+            FROM top.story_embeddings_pca pca
-        ON s.id = pca.story_id
+            JOIN top.stories s
-        JOIN top.publisher_bias pb
+            ON s.id = pca.story_id
-        ON pb.publisher_id = s.publisher_id
+            JOIN top.publisher_bias pb
-        JOIN bias_ratings b
+            ON pb.publisher_id = s.publisher_id
-        ON b.id = pb.bias_id
+            JOIN bias_ratings b
-    """).df()
+            ON b.id = pb.bias_id
-    DB.close()
+        """).df()
    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
@click.command('plot:avg-sentence-pca')
 def avg_sentence_pca():
-    filename = "avg_embedding_sentence_pca.png"
+    save_to = paths('figures') / "avg_embedding_sentence_pca.png"
    DB = connect()
-    data = DB.query("""
+    with connect() as db:
-        SELECT
+        data = db.query("""
-            pca.first
+            SELECT
-            ,pca.second
+                pca.first
-            ,p.tld
+                ,pca.second
-            ,b.bias as label
+                ,p.tld
-        FROM top.publisher_embeddings_pca pca
+                ,b.bias as label
-        JOIN top.publishers p
+            FROM top.publisher_embeddings_pca pca
-        ON p.id = pca.publisher_id
+            JOIN top.publishers p
-        JOIN top.publisher_bias pb
+            ON p.id = pca.publisher_id
-        ON pb.publisher_id = p.id
+            JOIN top.publisher_bias pb
-        JOIN bias_ratings b
+            ON pb.publisher_id = p.id
-        ON b.id = pb.bias_id
+            JOIN bias_ratings b
-    """).df()
+            ON b.id = pb.bias_id
-    DB.close()
+        """).df()
    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
@click.command('plot:sentence-confusion')
 def sentence_confusion():
@ -65,32 +60,31 @@ def sentence_confusion():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay
-    filename = "sentence_confusion.png"
+    save_to = paths('figures') / "sentence_confusion.png"
-    embeddings = np.load(data_path / 'embeddings.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = db.query("""
-        SELECT
+            SELECT
-            ids.index
+                ids.index
-            ,s.id
+                ,s.id
-            ,b.ordinal
+                ,b.ordinal
-        FROM ids
+            FROM ids
-        JOIN top.stories s
+            JOIN top.stories s
-        ON ids.story_id = s.id
+            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
+            JOIN top.publisher_bias pb
-        ON pb.publisher_id = s.publisher_id
+            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
+            JOIN bias_ratings b
-        ON b.id = pb.bias_id
+            ON b.id = pb.bias_id
-    """).df()
+        """).df()
-    pub = DB.query("""
+        pub = db.query("""
-        SELECT
+            SELECT
-            *
+                *
-        FROM top.publishers
+            FROM top.publishers
-    """).df()
+        """).df()
    DB.close()
    train, test = train_test_split(data)
    train_x, train_y = embeddings[train['index']], train['ordinal']
@ -105,7 +99,7 @@ def sentence_confusion():
    ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
    plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
--- a/src/plots/sentiment.py
+++ b/src/plots/sentiment.py
@ -1,138 +1,135 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
 def over_time():
    filename = "sentiment_over_time.png"
-    DB = connect()
+    filename = "sentiment_over_time.png"
-    data = DB.sql("""
+    save_to = paths('figures') / filename
-        SELECT
+
-            avg(sent.class_id) as sentiment
+    with connect() as db:
-            ,s.published_at as date
+        data = db.sql("""
-        FROM top.story_sentiments sent
+            SELECT
-        JOIN top.stories s
+                avg(sent.class_id) as sentiment
-        ON s.id = sent.story_id
+                ,s.published_at as date
-        GROUP BY
+            FROM top.story_sentiments sent
-            s.published_at
+            JOIN top.stories s
-    """).df()
+            ON s.id = sent.story_id
-    DB.close()
+            GROUP BY
                s.published_at
        """).df()
    ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
    ax.set(title="sentiment vs. time")
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time')
 def bias_over_time():
    """plot sentiment/bias vs. time"""
    filename = "bias_vs_sentiment_over_time.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
-    data = DB.sql("""
+        data = db.sql("""
-        SELECT
+            with cte as (
-            avg(sent.class_id) as sentiment
+                SELECT
-            ,date_trunc('yearweek', s.published_at) as date
+                    avg(sent.class_id) as sentiment
-            --,b.ordinal as ordinal
+                    ,date_trunc('yearweek', s.published_at) as date
-            ,b.bias
+                    ,p.bias
-        FROM top.story_sentiments sent
+                FROM story_sentiments sent
-        JOIN top.stories s
+                JOIN stories s
-        ON s.id = sent.story_id
+                ON s.id = sent.story_id
-        JOIN publisher_bias pb
+                JOIN mbfc.publisher_stories ps
-        ON pb.publisher_id = s.publisher_id
+                ON ps.story_id = s.id
-        JOIN bias_ratings b
+                JOIN mbfc.publishers p
-        ON b.id = pb.bias_id
+                ON p.id = ps.publisher_id
-        GROUP BY
+                WHERE p.ordinal != -1
-            date_trunc('yearweek', s.published_at)
+                GROUP BY
-            ,b.bias
+                    date_trunc('yearweek', s.published_at)
-    """).df()
+                    ,p.bias
-    DB.close()
+            )
            SELECT
                median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
                ,date
                ,bias
            FROM cte
            WHERE year(date) not in (2005, 2023)
        """).df()
-    order = ['left', 'left-center', 'center', 'right-center', 'right']
+    #ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
-    ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
+    ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
    plt.axhline(y=0.5, color='black', linestyle='--', label='neutral') 
    ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
    plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner')
 def bias_vs_recent_winner():
    """plot bias vs. distance to election"""
    filename = "bias_vs_recent_winner.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
-    data = DB.sql("""
+        data = db.sql("""
-        SELECT
+            SELECT
-            e.days_away as days_away
+                round(e.days_away, -1) as days_away
-            ,b.ordinal
+                ,p.bias
-            ,avg(sent.class_id) as sentiment
+                ,avg(sent.class_id) as sentiment
-            ,count(1) as stories
+                ,count(1) as stories
-        FROM top.stories s
+            FROM stories s
-        JOIN top.story_sentiments sent
+            JOIN story_sentiments sent
-        ON s.id = sent.story_id
+            ON s.id = sent.story_id
-        JOIN election_distance e
+            JOIN election_distance e
-        ON e.publish_date = s.published_at
+            ON e.publish_date = s.published_at
-        JOIN publisher_bias pb
+            JOIN mbfc.publisher_stories ps
-        ON pb.publisher_id = s.publisher_id
+            ON ps.story_id = s.id
-        JOIN bias_ratings b
+            JOIN mbfc.publishers p
-        ON b.id = pb.bias_id
+            ON p.id = ps.publisher_id
-        GROUP BY
+            GROUP BY
-            e.days_away
+                round(e.days_away, -1)
-            ,b.ordinal
+                ,p.bias
-    """).df()
+        """).df()
    DB.close()
    data
-    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
+    ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment")
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
    plt.close()
-
+    print(f"saved: {save_to}")
    print(f"saved: {filename}")
@click.command('plot:sentiment-hist')
 def sentiment_hist():
    filename = "sentiment_hist.png"
    save_to = paths('figures') / filename
-    DB = connect()
+    with connect() as db:
        data = db.sql("""
            SELECT
                p.bias
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
            ON ps.story_id = s.id
            JOIN mbfc.publishers p
            ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
        """).df()
-    DB.query("""
+    ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
-        select
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
            sent.label
            ,count(distinct s.id) as stories
            ,count(distinct s.publisher_id) as publishers
        from top.story_sentiments sent
        join top.stories s
        on s.id = sent.story_id
        group by
            sent.label
    """).df().to_markdown(index=False)
    data = DB.sql("""
        SELECT
            b.ordinal
            ,count(1) as stories
        FROM stories s
        JOIN publisher_bias pb
        ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
        ON b.id = pb.bias_id
        GROUP BY
            b.ordinal
    """).df()
    DB.close()
    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
-    print(f"saved: {filename}")
+    plt.close()
    print(f"saved: {save_to}")
--- a/src/selection.py
+++ b/src/selection.py
@ -1,48 +0,0 @@
 from data.main import connect
 import pandas as pd
 import numpy as np
 DB = connect()
 edges = DB.query("""
    select
    *
    from link_edges
 """).df()
 DB.close()
 edges
 adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
 select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
 DB = connect()
 DB.query("create schema top")
 DB.query("""
    CREATE OR REPLACE TABLE top.publishers AS
    SELECT
        p.*
    FROM publishers p
    JOIN select_publishers s
    ON s.publisher_id = p.id
 """)
 DB.query("""
    CREATE OR REPLACE TABLE top.stories AS
    SELECT
        s.*
    FROM stories s
    JOIN top.publishers p
    ON s.publisher_id = p.id
    WHERE year(s.published_at) >= 2006
    AND year(s.published_at) < 2023
 """)
 DB.query("""
    CREATE OR REPLACE TABLE top.related_stories AS
    SELECT
        r.*
    FROM top.stories s
    JOIN related_stories r
    ON s.id = r.parent_id
 """)
--- a/src/sentence.py
+++ b/src/sentence.py
@ -1,7 +1,7 @@
 from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn.functional as F
-from data.main import connect, data_dir
+from data.main import connect, paths
 import os
 from pathlib import Path
 import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
    ids = np.concatenate(embedding_ids)
    # save embeddings
-    save_to = data_dir() / 'embeddings.npy'
+    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")
@ -75,29 +75,28 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
 def create_avg_pca_table():
    from sklearn.decomposition import PCA
    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    embeddings = np.load(data_path / 'embeddings.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
-    DB = connect()
+    
-    data = DB.query("""
+    with connect() as db:
-        SELECT
+        data = db.query("""
-            ids.index
+            SELECT
-            ,s.id
+                ids.index
-            ,s.publisher_id
+                ,s.id
-            ,b.ordinal
+                ,s.publisher_id
-        FROM ids
+                ,b.ordinal
-        JOIN top.stories s
+            FROM ids
-        ON ids.story_id = s.id
+            JOIN top.stories s
-        JOIN top.publisher_bias pb
+            ON ids.story_id = s.id
-        ON pb.publisher_id = s.publisher_id
+            JOIN top.publisher_bias pb
-        JOIN bias_ratings b
+            ON pb.publisher_id = s.publisher_id
-        ON b.id = pb.bias_id
+            JOIN bias_ratings b
-    """).df()
+            ON b.id = pb.bias_id
-    DB.close()
+        """).df()
    results = []
    for publisher_id, group in data.groupby(['publisher_id']):
@ -115,47 +114,45 @@ def create_avg_pca_table():
    results['second'] = pred[:, 1]
    table_name = "top.publisher_embeddings_pca"
-    DB = connect()
+    with connect() as db:
-    DB.query(f"""
+        db.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
+            CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
+            SELECT
-            results.publisher_id as publisher_id
+                results.publisher_id as publisher_id
-            ,results.first as first
+                ,results.first as first
-            ,results.second as second
+                ,results.second as second
-        FROM results
+            FROM results
-    """)
+        """)
-    DB.close()
+
    print(f"created {table_name}")
@click.command('sentence:create-pca-table')
 def create_pca_table():
    from sklearn.decomposition import PCA
    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    embeddings = np.load(data_path / 'embeddings.npy')
+    embeddings = np.load(path('data') / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embedding_ids = np.load(path('data') / 'embedding_ids.npy')
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = db.query("""
-        SELECT
+            SELECT
-            ids.index
+                ids.index
-            ,s.id
+                ,s.id
-            ,b.ordinal
+                ,b.ordinal
-        FROM ids
+            FROM ids
-        JOIN top.stories s
+            JOIN top.stories s
-        ON ids.story_id = s.id
+            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
+            JOIN top.publisher_bias pb
-        ON pb.publisher_id = s.publisher_id
+            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
+            JOIN bias_ratings b
-        ON b.id = pb.bias_id
+            ON b.id = pb.bias_id
-    """).df()
+        """).df()
-    pub = DB.query("""
+        pub = db.query("""
-        SELECT
+            SELECT
-            *
+                *
-        FROM top.publishers
+            FROM top.publishers
-    """).df()
+        """).df()
    DB.close()
    x = embeddings[data['index']]
    y = data['ordinal'].to_numpy().reshape(-1, 1)
@ -166,42 +163,41 @@ def create_pca_table():
    table_name = f"top.story_embeddings_pca"
-    DB = connect()
+    with connect() as db:
-    DB.query(f"""
+        db.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
+            CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
+            SELECT
-            data.id as story_id
+                data.id as story_id
-            ,data.first as first
+                ,data.first as first
-            ,data.second as second
+                ,data.second as second
-        FROM data
+            FROM data
-    """)
+        """)
-    DB.close()
+
    print(f"created {table_name}")
@click.command('sentence:create-svm-table')
 def create_svm_table():
    from sklearn import svm
    from sklearn.linear_model import SGDClassifier
    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    embeddings = np.load(data_path / 'embeddings.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
-    DB = connect()
+    with connect() as db:
-    data = DB.query("""
+        data = db.query("""
-        SELECT
+            SELECT
-            ids.index
+                ids.index
-            ,s.id
+                ,s.id
-            ,b.ordinal
+                ,b.ordinal
-        FROM ids
+            FROM ids
-        JOIN top.stories s
+            JOIN top.stories s
-        ON ids.story_id = s.id
+            ON ids.story_id = s.id
-        JOIN top.publisher_bias pb
+            JOIN top.publisher_bias pb
-        ON pb.publisher_id = s.publisher_id
+            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
+            JOIN bias_ratings b
-        ON b.id = pb.bias_id
+            ON b.id = pb.bias_id
-    """).df()
+        """).df()
    x = embeddings[data['index']]
    #y = data['ordinal'].to_numpy().reshape(-1, 1)