Merge branch 'feature_factcheck'

This commit is contained in:
matt 2023-06-01 09:44:28 -07:00
commit 81f4f37c9d
40 changed files with 1354 additions and 1137 deletions

Binary image files not shown. 17 images changed: 7 added (148, 22, 40, 87, 66, 40, 29 KiB) and 10 updated (21->22, 16->19, 235->73, 202->104, 44->46, 30->36, 51->66, 128->128, 22->21, 54->53 KiB).

27
src/apriori.py Normal file
View File

@ -0,0 +1,27 @@
import click
from efficient_apriori import apriori
from data.main import connect
@click.command("apriori:rules")
def rules():
DB = connect()
data = DB.query("""
SELECT
--list_prepend(parent.id, list(child.id)) as transaction
list_prepend(parent.tld, list(child.tld)) as transaction
FROM stories s
JOIN related_stories r
ON r.parent_id = s.id
JOIN publishers parent
ON parent.id = s.publisher_id
JOIN publishers child
ON child.id = r.publisher_id
GROUP BY
--parent.id
parent.tld
""").df()
DB.close()
transactions = data.transaction.apply(lambda x: tuple(x)).values
itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
print(*rules, sep="\n")
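A minimal sketch of what rules() computes above, calling efficient_apriori directly on hand-written transactions (the TLDs are placeholders, not values from the dataset):

from efficient_apriori import apriori

# each transaction: a parent publisher's tld followed by the tlds it links to
transactions = [
    ('a.example', 'b.example', 'c.example'),
    ('a.example', 'b.example'),
    ('d.example', 'b.example'),
]
itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
print(*rules, sep="\n")  # e.g. {a.example} -> {b.example}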

View File

@ -1,67 +1,42 @@
import click
from data.main import connect
from data.main import connect, paths
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
def label_to_int(rating:str) -> int:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
def int_to_label(class_id: int) -> str:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
-1 : 'allsides',
}
return mapping[class_id]
@click.command(name="bias:normalize")
def normalize() -> None:
DB = connect()
DB.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id as publisher_id
,b.id as bias_id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN top.publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id as publisher_id
,b.id as bias_id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN top.publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
publisher_id
,bias_id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
publisher_id
,bias_id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
publisher_id
,label
,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
mapping = [
{'label' :'left' , 'ordinal': -2},
@ -72,22 +47,20 @@ def normalize() -> None:
]
mapping = pd.DataFrame(mapping)
DB.query("alter table bias_ratings add column ordinal int")
DB.query("""
update bias_ratings b
set ordinal = o.ordinal
FROM mapping o
WHERE o.label = b.bias
""")
with connect() as db:
db.query("alter table bias_ratings add column ordinal int")
db.query("""
update bias_ratings b
set ordinal = o.ordinal
FROM mapping o
WHERE o.label = b.bias
""")
@click.command(name='bias:parse')
def parse() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
bias_html = paths('data') / 'allsides.html'
parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser)
@ -111,65 +84,63 @@ def parse() -> None:
rating['disagree'] = int(disagree)
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load")
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
f = str(paths('data') / "bias_ratings.csv")
DB.sql(f"""
CREATE TABLE bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
with connect() as db:
db.sql(f"""
CREATE TABLE bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
@click.command('bias:export')
def export():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
with connect() as db:
all_bias = db.query("""
SELECT
id as bias_id
,publisher as name
,bias as label
FROM bias_ratings
ORDER by agree desc
""")
DB = connect()
all_bias = DB.query("""
SELECT
id as bias_id
,publisher as name
,bias as label
FROM bias_ratings
ORDER by agree desc
all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
with connect() as db:
mapped_bias = db.query("""
SELECT
p.id as publisher_id
,p.name as name
,p.tld as tld
,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
mapped_bias = DB.query("""
SELECT
p.id as publisher_id
,p.name as name
,p.tld as tld
,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
DB.close()
mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
@click.command('bias:import-mapped')
def import_mapped():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
table_name = "top.publisher_bias"
DB = connect()
df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
print(f"created table: {table_name}")

View File

@ -1,5 +1,7 @@
import click
from dotenv import load_dotenv
import data
import plots
@click.group()
def cli():
@ -7,12 +9,20 @@ def cli():
if __name__ == "__main__":
load_dotenv()
from data import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
cli.add_command(scrape.normalize)
cli.add_command(scrape.create_elections_table)
# original bias ratings
cli.add_command(data.scrape.download)
cli.add_command(data.scrape.parse)
cli.add_command(data.scrape.load)
cli.add_command(data.scrape.normalize)
cli.add_command(data.scrape.create_elections_table)
cli.add_command(data.factcheck.parse_index)
cli.add_command(data.factcheck.scrape)
cli.add_command(data.links.create_table)
cli.add_command(data.links.create_pca)
cli.add_command(data.links.create_clusters)
import word
# cli.add_command(word.distance)
@ -23,10 +33,12 @@ if __name__ == "__main__":
cli.add_command(bias.parse)
cli.add_command(bias.load)
cli.add_command(bias.normalize)
import mine
cli.add_command(mine.embeddings)
cli.add_command(mine.cluster)
cli.add_command(mine.plot)
import emotion
cli.add_command(emotion.extract)
cli.add_command(emotion.normalize)
@ -40,34 +52,20 @@ if __name__ == "__main__":
from train import main as train_main
cli.add_command(train_main.main)
import plots.descriptive as plotd
cli.add_command(plotd.articles_per_year)
cli.add_command(plotd.distinct_publishers)
cli.add_command(plotd.stories_per_publisher)
cli.add_command(plotd.top_publishers)
cli.add_command(plotd.common_tld)
import links as linkcli
cli.add_command(linkcli.create_table)
cli.add_command(linkcli.create_pca)
cli.add_command(linkcli.create_clusters)
import plots.links as plotl
cli.add_command(plotl.elbow)
cli.add_command(plotl.link_pca_clusters)
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.descriptive.articles_per_year)
cli.add_command(plots.descriptive.distinct_publishers)
cli.add_command(plots.descriptive.stories_per_publisher)
cli.add_command(plots.descriptive.top_publishers)
cli.add_command(plots.descriptive.common_tld)
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli.add_command(plots.links.elbow)
cli.add_command(plots.links.link_pca_clusters)
cli.add_command(plots.classifier.pca_with_classes)
cli()
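The block above is plain click wiring: one group, many add_command calls. A self-contained sketch of the same pattern (demo:hello is hypothetical):

import click

@click.group()
def cli():
    pass

@click.command('demo:hello')
def hello():
    print("hello")

cli.add_command(hello)

if __name__ == "__main__":
    cli()  # run as e.g.: python app.py demo:hello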

View File

@ -1,6 +1,10 @@
import data.main
import data.scrape
import data.factcheck
import data.links
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
]

171
src/data/factcheck.py Normal file
View File

@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from random import randint
from time import sleep
from tqdm import tqdm
@click.command('mbfc:parse-index')
def parse_index():
parser = etree.HTMLParser()
publishers = []
for page in range(1, 54):
url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
print(f"downloading {url}", file=sys.stderr)
response = requests.get(url)
html = response.content
tree = etree.parse(BytesIO(html), parser)
rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
print(f"parsing {len(rows)} rows", file=sys.stderr)
for row in rows:
publisher = {}
link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
link = link.xpath('./a')[0]
publisher['name'] = link.text
publisher['detail_url'] = link.get('href')
publisher['bias'] = bias.text
publisher['reporting'] = reporting.text
publisher['country'] = country.text
publisher['credibility'] = credibility.text
publisher['media_type'] = media_type.text
publisher['traffic'] = traffic.text
publisher['popularity'] = popularity.xpath('./span')[0].text
publishers.append(publisher)
df = pd.DataFrame(publishers)
save_to = paths('data') / 'mbfc_bias.csv'
df.to_csv(save_to, sep='|', index=False)
print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
with connect() as db:
db.sql("""create schema mbfc""")
db.sql("""create or replace table mbfc.scrape (
url text
,scraped_at datetime default now()
)
""")
@click.command("mbfc:scrape")
def scrape():
df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
with connect() as db:
stats = db.query("""
select
count(1) filter(where s.url is not null) as elapsed
,count(1) filter(where s.url is null) as remaining
from df
left join mbfc.scrape s
on df.detail_url = s.url
""").fetchall()
df = db.query("""
select
detail_url as url
from df
where df.detail_url not in (
select
url
from mbfc.scrape
)
""").df()
print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
for url in df.url:
delay = randint(1,3)
save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
sleep(delay)
try:
response = requests.get(url)
except Exception as e:
print(f"request failed: {url}", file=sys.stderr)
continue
with open(save_as, 'w') as f:
f.write(response.text)
with connect() as db:
db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
print(f"saved: {save_as}", file=sys.stderr)
def load():
publishers = []
for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
publisher = {}
publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
with page.open() as p:
tree = BeautifulSoup(p, 'html.parser')
for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
e = e.parent
while e.name != 'p':
e = e.parent
l = e.find('a')
if l:
publisher['tld'] = l.get('href')
break
else:
breakpoint()
publishers.append(publisher)
df = pd.DataFrame(publishers)
df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
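load() above locates the text node matching "Source:", climbs to its enclosing <p>, and takes the first link as the publisher's homepage. The same traversal on an inline snippet:

import re
from bs4 import BeautifulSoup

html = '<p><strong>Source:</strong> <a href="https://example.com/">example.com</a></p>'
tree = BeautifulSoup(html, 'html.parser')
for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
    p = e.find_parent('p')  # bs4 shortcut for the while-loop climb above
    print(p.find('a').get('href'))  # https://example.com/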
@click.command('mbfc:create-tables')
def create_tables():
pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
df = pubs.merge(urls, on='mbfc_url')
df['tld'] = df.tld.apply(map_tld)
df['ordinal'] = df.bias.apply(bias_label_to_int)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publishers AS
SELECT
row_number() over() as id
,p.tld
,mode(p.name) as name
,mode(p.bias) as bias
,mode(p.ordinal) as ordinal
,mode(p.reporting) as reporting
,mode(p.country) as country
,mode(p.credibility) as credibility
,mode(p.media_type) as media_type
,mode(p.traffic) as traffic
,mode(p.popularity) as popularity
FROM df p
GROUP BY
p.tld
""")
with connect() as db:
raw_stories = db.sql("""
SELECT
*
FROM stories s
""").df()
raw_stories['tld'] = raw_stories.url.apply(map_tld)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publisher_stories AS
SELECT
s.id as story_id
,p.id as publisher_id
FROM raw_stories s
JOIN mbfc.publishers p
ON p.tld = s.tld
""")

135
src/data/links.py Normal file
View File

@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE link_edges AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM stories s
JOIN related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
db.query("""
SELECT
*
,count(1) over()
FROM link_edges e
limit 1
""")
print(f"created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
with connect() as db:
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")

View File

@ -2,6 +2,10 @@ import os
from pathlib import Path
import duckdb
from enum import Enum
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
import sys
class Data(str, Enum):
Titles = 'titles'
@ -9,6 +13,16 @@ class Data(str, Enum):
def data_dir():
return Path(os.environ['DATA_MINING_DATA_DIR'])
def paths(name='app'):
if 'app' in name:
return Path(os.environ['DATA_MINING_APP_DIR'])
if 'data' in name:
return Path(os.environ['DATA_MINING_DATA_DIR'])
if 'doc' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR'])
if 'figure' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@ -28,3 +42,66 @@ def from_db(t: Data):
limit 100
""").df()
return table
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
print(f"'{x}' is not valid.", file=sys.stderr)
return None
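map_tld() above is a thin wrapper over tld.get_tld that normalizes any story URL to its registered domain, or None when parsing fails:

from tld import get_tld

res = get_tld('https://www.example.co.uk/some/article', as_object=True)
print(res.fld)  # example.co.uk, the value map_tld returns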
def ticklabels():
return [
'Left',
'Left-Center',
'Least Biased',
'Right-Center',
'Right',
]
def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
if source == 'mbfc':
mapping = {
'Left' : 0,
'Left-Center' : 1,
'Least Biased' : 2,
'Right-Center' : 3,
'Right' : 4,
}
else:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
}
try:
return mapping[rating]
except:
print(f"no mapping for {rating}", file=sys.stderr)
return -1
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
if source == 'mbfc':
mapping = {
0 : 'Left',
1 : 'Left-Center',
2 : 'Least Biased',
3 : 'Right-Center',
4 : 'Right',
}
else:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
}
try:
return mapping[class_id]
except:
print(f"no mapping for {class_id}", file=sys.stderr)
return -1
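A quick round-trip check of the two mappings above, assuming the module is importable as data.main (as it is elsewhere in this commit):

from data.main import bias_label_to_int, bias_int_to_label

assert bias_int_to_label(bias_label_to_int('Least Biased')) == 'Least Biased'
assert bias_label_to_int('no such label') == -1  # unmapped ratings warn and return -1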

View File

@ -319,12 +319,6 @@ def another_norm():
""")
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
return None
DB.sql("""
SELECT

47
src/data/selection.py Normal file
View File

@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np
def create_tables():
with connect() as db:
edges = db.query("""
select
*
from link_edges
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
with connect() as db:
db.query("create schema top")
db.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
db.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
db.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,10 +1,11 @@
import click
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data import connect, data_dir
from data.main import connect, paths
import numpy as np
from tqdm import tqdm
import click
import pandas as pd
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
@ -67,20 +68,19 @@ def extract(chunks):
@click.command('sentiment:load')
def load():
DB = connect()
sentiments = np.load(data_dir() / 'sentiment.npy')
story_ids = np.load(data_dir() / 'sentiment_ids.npy')
sentiments = np.load(paths('data') / 'sentiment.npy')
story_ids = np.load(paths('data') / 'sentiment_ids.npy')
data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
data['sentiment_id'] = sentiments
DB.query("""
CREATE OR REPLACE TABLE top.story_sentiments AS
SELECT
data.story_id
,data.sentiment_id as class_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
FROM data
JOIN top.stories s
ON s.id = data.story_id
""")
DB.close()
with connect() as db:
db.query("""
CREATE OR REPLACE TABLE story_sentiments AS
SELECT
data.story_id
,data.sentiment_id as class_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
FROM data
JOIN stories s
ON s.id = data.story_id
""")

View File

@ -1,255 +0,0 @@
import click
from data.main import connect
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
@click.command('links:create-table')
def create_table():
table_name = "top.link_edges"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM top.stories s
JOIN top.related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
DB.close()
DB = connect()
DB.query("""
SELECT
*
,-log10(links)
--distinct parent_id
FROM top.link_edges e
WHERE e.parent_id = 238
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"top.publisher_pca_{source}"
DB = connect()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
from sklearn.cluster import KMeans
table_name = f"top.publisher_clusters_{source}"
DB = connect()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
DB.close()
print(f"created {table_name}")
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
DB = connect()
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
from publishers p
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
# only keep values that have more than 1 link
test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
adj.values.shape
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
return out
@click.command('links:analysis')
def analysis():
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
adj = to_matrix()
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
plt.show()

View File

@ -1,6 +1,5 @@
from data.main import data_dir, connect
from data.main import connect, paths
import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans
import click
from pathlib import Path
@ -11,7 +10,7 @@ from enum import Enum, auto
@click.command(name="mine:embeddings")
def embeddings():
data = np.load(data_dir() / "embeddings.npy")
data = np.load(paths('data') / "embeddings.npy")
kmeans = MiniBatchKMeans(n_clusters=5,
random_state=0,
batch_size=6,
@ -76,7 +75,7 @@ class PlotName(str, Enum):
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: Path):
output = output if output else APP_DIR / f'docs/{name}.png'
output = output if output else paths('figures') / f'{name}.png'
if name == PlotName.TitleLength:
fig, ax = plt.subplots(1,1)
data = db.sql("""

36
src/mining/bias.py Normal file
View File

@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path
def normalize():
with connect() as db:
db.sql("""
SELECT
p.name
,count(1) as ctn
,sum(ctn) over() as total
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.name
""")
with connect() as db:
db.sql("""
SELECT
bias
,count(distinct p.id) as publishers
,count(1) as stories
,count(1) / count(distinct p.id) as ratio
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.bias
ORDER BY count(1)
""")

View File

@ -1,9 +1,13 @@
import plots.sentence
import plots.emotion
import plots.sentiment
import plots.links
import plots.classifier
__all__ = [
'sentence',
'emotion',
'sentiment',
'links',
'classifier',
]

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths, bias_label_to_int, ticklabels
import os
from pathlib import Path
import seaborn as sns
@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
def hist():
filename = "bias_hist.png"
save_to = paths('figures') / "bias_hist.png"
with connect() as db:
data = db.sql("""
SELECT
p.ordinal
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
p.ordinal
""").df()
DB = connect()
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-publisher-hist')
def publisher_hist():
filename = "bias_publisher_hist.png"
save_to = paths('figures') / "bias_publisher_hist.png"
DB = connect()
data = DB.sql("""
SELECT
b.ordinal
,count(1) as publishers
FROM publisher_bias pb
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
with connect() as db:
data = db.sql("""
SELECT
p.ordinal
,count(distinct p.id) as publishers
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
p.ordinal
""").df()
ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")

View File

@ -5,30 +5,32 @@ import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:pca-with-classes')
def pca_with_classes():
filename = "pca_with_classes.png"
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def pca_with_classes(source):
DB = connect()
data = DB.query(f"""
SELECT
p.tld
,b.bias
,c.first
,c.second
,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM top.publishers p
JOIN top.publisher_bias pb
ON p.id = pb.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN top.publisher_pca_normalized c
ON c.publisher_id = p.id
""").df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
print(f"saved: {filename}")
save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
with connect() as db:
df = db.query(f"""
SELECT
p.tld
,p.bias
,c.first
,c.second
--,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM mbfc.publishers p
JOIN publisher_pca_{source} c
ON c.publisher_id = p.id
WHERE p.ordinal != -1
ORDER BY p.ordinal
""").df()
ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
ax.set(xlabel="first pca component",
ylabel="second pca component")
ax.figure.suptitle("pca components vs. bias labels")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')

View File

@ -1,169 +1,190 @@
import click
from data.main import connect
from data.main import connect, paths
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:articles-per-year')
def articles_per_year():
filename = 'articles_per_year.png'
save_to = paths('figures') / 'articles_per_year.png'
DB = connect()
data = DB.query("""
select
year(published_at) as year
,count(1) as stories
from stories
group by
year(published_at)
""").df()
DB.close()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
,count(1) as stories
from stories
group by
year(published_at)
""").df()
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of articles per year", ylabel="count of stories (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('plot:distinct-publishers')
def distinct_publishers():
filename = 'distinct_publishers.png'
save_to = paths('figures') / 'distinct_publishers.png'
DB = connect()
data = DB.query("""
select
year(published_at) as year
,count(distinct publisher_id) as publishers
from stories
group by
year(published_at)
""").df()
DB.close()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
,count(distinct publisher_id) as publishers
from stories
group by
year(published_at)
""").df()
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher')
def stories_per_publisher():
filename = 'stories_per_publisher.png'
save_to = paths('figures') / 'stories_per_publisher.png'
DB = connect()
data = DB.query("""
with cte as (
select
publisher_id
,year(published_at) as year
,count(1) as stories
from stories
group by
publisher_id
,year(published_at)
) , agg as (
with connect() as db:
data = db.query("""
with cte as (
select
publisher_id
,avg(stories) as stories_per_year
,case
when avg(stories) < 2 then 2
when avg(stories) < 4 then 4
when avg(stories) < 8 then 8
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
ps.publisher_id
,year(s.published_at) as year
,count(1) as stories
from stories s
join mbfc.publisher_stories ps
on ps.story_id = s.id
group by
publisher_id
)
select
max_avg
,count(1) as publishers
from agg
group by
max_avg
""").df()
DB.close()
ps.publisher_id
,year(s.published_at)
) , agg as (
select
publisher_id
,avg(stories) as stories_per_year
,case
when avg(stories) < 2 then 2
when avg(stories) < 4 then 4
when avg(stories) < 8 then 8
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
group by
publisher_id
)
select
max_avg
,count(1) as publishers
from agg
group by
max_avg
""").df()
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:top-publishers')
def top_publishers():
"""plot top publishers over time"""
filename = 'top_publishers.png'
save_to = paths('figures') / 'top_publishers.png'
DB = connect()
data = DB.query("""
select
p.tld
,year(published_at) as year
,count(1) as stories
from (
select
with connect() as db:
db.query("""
SELECT
p.tld
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
""")
with connect() as db:
data = db.query("""
WITH p as (
SELECT
p.tld
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
)
SELECT
p.tld
,p.id
from top.publishers p
join top.stories s
on s.publisher_id = p.id
group by
,YEAR(s.published_at) AS year
,COUNT(1) AS stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN p
ON p.id = ps.publisher_id
GROUP by
p.tld
,p.id
order by count(1) desc
limit 20
) p
join top.stories s
on s.publisher_id = p.id
group by
p.tld
,year(published_at)
order by count(distinct s.id) desc
""").df()
DB.close()
,YEAR(published_at)
ORDER BY year, COUNT(DISTINCT s.id) DESC
""").df()
pivot = data.pivot(columns='year', index='tld', values='stories')
ax = sns.heatmap(pivot, cmap="crest")
ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:common_tld')
def common_tld():
import dataframe_image as dfi
filename = 'common_tld.png'
save_to = paths('figures') / 'common_tld.png'
DB = connect()
data = DB.query("""
select
split_part(url, '.', -1) as tld
,count(1) as publishers
,case when count(1) < 20
then string_agg(distinct url, '\t')
else NULL
end as urls
from publishers
group by
split_part(url, '.', -1)
order by
count(1) desc
""").df()
DB.close()
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
with connect() as db:
data = db.query("""
select
split_part(url, '.', -1) as tld
,count(1) as publishers
,case when count(1) < 20
then string_agg(distinct url, '\t')
else NULL
end as urls
from publishers
group by
split_part(url, '.', -1)
order by
count(1) desc
""").df()
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
def stats():
@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats')
def bias_stats():
import dataframe_image as dfi
filename = 'bias_stats.png'
save_to = paths('figures') / 'bias_stats.png'
DB = connect()
@ -300,3 +321,69 @@ def bias_stats():
""").df()
DB.close()
print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
def bias_over_time():
"""plot bias labels over time"""
save_to = paths('figures') / 'bias_over_time.png'
with connect() as db:
df = db.sql("""
SELECT
p.bias
,p.id
,date_trunc('year', s.published_at) as year
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
where year(s.published_at) not in (2006, 2023)
and p.ordinal != -1
GROUP BY
p.bias
,p.id
,p.ordinal
,date_trunc('year', s.published_at)
order by
p.ordinal
,date_trunc('year', s.published_at)
""").df()
ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
def bias_missing():
with connect() as db:
df = db.sql("""
SELECT
date_trunc('year', s.published_at) as year
,s.tld
,count(1) as stories
FROM stories s
LEFT JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
WHERE ps.publisher_id is NULL
AND year(s.published_at) not in (2006, 2023)
GROUP BY
s.tld
,date_trunc('year', s.published_at)
HAVING count(1) > 10
ORDER BY
date_trunc('year', s.published_at)
""").df()
ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.show()
#plt.savefig(save_to)
plt.close()
#print(f"saved: {save_to}")

View File

@ -1,77 +1,79 @@
import click
from data.main import connect
from data.main import connect, paths, ticklabels
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
def emotion_over_time():
filename = "emotion_over_time.png"
DB = connect()
emotions = DB.sql("""
SELECT
date_trunc('year', s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM top.stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
date_trunc('year', s.published_at)
,e.label
""").df()
DB.close()
filename = "emotion_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
emotions = db.sql("""
SELECT
date_trunc('year', s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
date_trunc('year', s.published_at)
,e.label
""").df()
ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression')
def emotion_regression():
"""plot emotion over time as regression"""
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
filename = "emotion_regression.png"
save_to = paths('figures') / filename
DB = connect()
emotions = DB.query("""
SELECT
label
FROM emotions e
""").df()['label'].to_list()
DB.close()
DB = connect()
df = DB.sql(f"""
SELECT
epoch(date_trunc('yearweek', s.published_at)) AS date
,e.id AS emotion_id
,p.id as publisher_id
,count(1) AS stories
FROM top.stories s
JOIN top.publishers p
ON p.id = s.publisher_id
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
epoch(date_trunc('yearweek', s.published_at))
,p.id
,e.id
""").df()
DB.close()
with connect() as db:
#emotions = db.query("""
# SELECT
# label
# FROM emotions e
#""").df()['label'].to_list()
df = db.sql(f"""
SELECT
epoch(date_trunc('yearweek', s.published_at)) AS date
,e.id AS emotion_id
,p.id as publisher_id
,count(1) AS stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE p.ordinal != -1
GROUP by
epoch(date_trunc('yearweek', s.published_at))
,p.id
,e.id
""").df()
results = []
for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@ -83,77 +85,59 @@ def emotion_regression():
results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
results = pd.DataFrame(results)
DB = connect()
out = DB.query("""
SELECT
e.label as emotion
--,p.tld
,avg(results.per_year) as avg_reg_coef
,b.ordinal
FROM results
JOIN emotions e
ON e.id = results.emotion_id
JOIN top.publishers p
ON p.id = results.publisher_id
JOIN publisher_bias pb
ON pb.publisher_id = results.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
e.label
,b.ordinal
""").df()
DB.close()
pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
with connect() as db:
out = db.query("""
SELECT
e.label as emotion
,avg(results.per_year) as avg_reg_coef
,p.bias
FROM results
JOIN emotions e
ON e.id = results.emotion_id
JOIN mbfc.publishers p
ON p.id = results.publisher_id
GROUP BY
e.label
,p.bias
""").df()
ax = sns.heatmap(pivot, cmap='RdBu_r')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
#ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
ax.set(title="slope of regression (stories/year) by bias and emotion"
,xticklabels=ticklabels
,xticklabels=ticklabels()
,xlabel="bias"
,ylabel="emotion")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:emotion-hist')
def emotion_hist():
filename = "emotion_hist.png"
save_to = paths('figures') / filename
DB = connect()
DB.query("""describe story_emotions""")
with connect() as db:
data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query("""
select
e.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from story_emotions se
join emotions e
on e.id = se.emotion_id
join top.stories s
on s.id = se.story_id
group by
e.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow')
def elbow():
from sklearn.cluster import KMeans
filename = 'link_cluster_elbow.png'
save_to = paths('figures') / 'link_cluster_elbow.png'
with connect() as db:
df = db.query("""
SELECT
*
FROM link_edges
""").df()
DB = connect()
df = DB.query("""
SELECT
*
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
to_plot = []
@ -36,8 +36,9 @@ def elbow():
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {save_to}")
# randomly pick 8
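The elbow figure above comes from refitting KMeans over a range of k and recording inertia (sum of squared distances to the nearest centroid). A sketch of that loop on random data:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(0).random((40, 5))
to_plot = [(k, KMeans(n_clusters=k, n_init="auto").fit(X).inertia_)
           for k in range(2, 13)]  # plot k vs. inertia and look for the bend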
@ -45,72 +46,65 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
filename = f"link_pca_clusters_{source}.png"
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
DB = connect()
df = DB.query(f"""
SELECT
c.label as cluster
,p.tld
--,b.label as bias
,pca.first
,pca.second
,s.cnt as stories
FROM top.publisher_clusters_{source} c
JOIN top.publishers p
ON c.publisher_id = p.id
JOIN
(
select
s.publisher_id
,count(1) as cnt
FROM top.stories s
GROUP BY
s.publisher_id
) s
ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
DB.close()
with connect() as db:
df = db.query(f"""
SELECT
c.label as cluster
,p.tld
--,b.label as bias
,pca.first
,pca.second
,s.cnt as stories
FROM top.publisher_clusters_{source} c
JOIN top.publishers p
ON c.publisher_id = p.id
JOIN
(
select
s.publisher_id
,count(1) as cnt
FROM top.stories s
GROUP BY
s.publisher_id
) s
ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
# .df().groupby(['cluster', 'bias']).describe()
plt.savefig(save_to)
print(f"saved plot: {save_to}")
def test():
data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
DB.query("""
SELECT
p.id as publisher_id
,p.name
,p.tld
,cast(b.bias_id as int) as bias_id
,count(1) as stories
FROM publishers p
JOIN stories s
ON s.publisher_id = p.id
JOIN publisher_clusters c
ON c.publisher_id = p.id
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
where bias_id is null
group by
p.id
,p.name
,p.tld
,b.bias_id
ORDER BY count(1) desc
""")
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
DB.close()
with connect() as db:
db.query("""
SELECT
p.id as publisher_id
,p.name
,p.tld
,cast(b.bias_id as int) as bias_id
,count(1) as stories
FROM publishers p
JOIN stories s
ON s.publisher_id = p.id
JOIN publisher_clusters c
ON c.publisher_id = p.id
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
where bias_id is null
group by
p.id
,p.name
,p.tld
,b.bias_id
ORDER BY count(1) desc
""")
@click.command('plot:link-confusion')
@ -120,34 +114,36 @@ def link_confusion():
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / filename
DB = connect()
bias = DB.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
@ -166,9 +162,9 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
@click.command('plot:link-classifier')
def link_confusion():
@ -176,49 +172,51 @@ def link_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / "link_confusion.png"
DB = connect()
bias = DB.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
y = bias.sort_values('publisher_id').ordinal
data = DB.query(f"""
SELECT
p.id as publisher_id
,pca.first
,pca.second
FROM top.publisher_pca_onehot pca
JOIN top.publishers p
ON pca.publisher_id = p.id
""").df()
with connect() as db:
data = db.query(f"""
SELECT
p.id as publisher_id
,pca.first
,pca.second
FROM top.publisher_pca_onehot pca
JOIN top.publishers p
ON pca.publisher_id = p.id
""").df()
@ -235,11 +233,11 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
plt.savefig(out_dir / filename)
plt.close()
print(f"saved plot: {filename}")
# ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
# plt.savefig(out_dir / filename)
# plt.close()
# print(f"saved plot: {filename}")

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths
import os
from pathlib import Path
import seaborn as sns
@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
def sentence_pca():
filename = "embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "embedding_sentence_pca.png"
data = DB.query("""
SELECT
pca.first
,pca.second
,b.bias as label
FROM top.story_embeddings_pca pca
JOIN top.stories s
ON s.id = pca.story_id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
,b.bias as label
FROM top.story_embeddings_pca pca
JOIN top.stories s
ON s.id = pca.story_id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
filename = "avg_embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "avg_embedding_sentence_pca.png"
data = DB.query("""
SELECT
pca.first
,pca.second
,p.tld
,b.bias as label
FROM top.publisher_embeddings_pca pca
JOIN top.publishers p
ON p.id = pca.publisher_id
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
,p.tld
,b.bias as label
FROM top.publisher_embeddings_pca pca
JOIN top.publishers p
ON p.id = pca.publisher_id
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:sentence-confusion')
def sentence_confusion():
@ -65,32 +60,31 @@ def sentence_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "sentence_confusion.png"
save_to = paths('figures') / "sentence_confusion.png"
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
train, test = train_test_split(data)
train_x, train_y = embeddings[train['index']], train['ordinal']
@ -105,7 +99,7 @@ def sentence_confusion():
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

View File

@ -1,138 +1,135 @@
import click
from data.main import connect
import os
from pathlib import Path
from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
def over_time():
filename = "sentiment_over_time.png"
DB = connect()
data = DB.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
GROUP BY
s.published_at
""").df()
DB.close()
filename = "sentiment_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
GROUP BY
s.published_at
""").df()
ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
ax.set(title="sentiment vs. time")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
"""plot sentiment/bias vs. time"""
filename = "bias_vs_sentiment_over_time.png"
save_to = paths('figures') / filename
DB = connect()
data = DB.sql("""
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
--,b.ordinal as ordinal
,b.bias
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
date_trunc('yearweek', s.published_at)
,b.bias
""").df()
DB.close()
with connect() as db:
data = db.sql("""
with cte as (
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
,p.bias
FROM story_sentiments sent
JOIN stories s
ON s.id = sent.story_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
date_trunc('yearweek', s.published_at)
,p.bias
)
SELECT
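-- DESC ordering makes the 7 FOLLOWING rows the 7 preceding weeks: an 8-week trailing median per bias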
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,date
,bias
FROM cte
WHERE year(date) not in (2005, 2023)
""").df()
order = ['left', 'left-center', 'center', 'right-center', 'right']
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
#ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8-week rolling median sentiment', xlabel='date')
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
"""plot bias vs. distance to election"""
filename = "bias_vs_recent_winner.png"
save_to = paths('figures') / filename
DB = connect()
data = DB.sql("""
SELECT
e.days_away as days_away
,b.ordinal
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM top.stories s
JOIN top.story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
e.days_away
,b.ordinal
""").df()
DB.close()
data
with connect() as db:
data = db.sql("""
SELECT
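-- round(days, -1) bins stories into 10-day buckets around the nearest election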
round(e.days_away, -1) as days_away
,p.bias
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM stories s
JOIN story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
GROUP BY
round(e.days_away, -1)
,p.bias
""").df()
ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment")
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-hist')
def sentiment_hist():
filename = "sentiment_hist.png"
save_to = paths('figures') / filename
DB = connect()
with connect() as db:
data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query("""
select
sent.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from top.story_sentiments sent
join top.stories s
on s.id = sent.story_id
group by
sent.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np
DB = connect()
edges = DB.query("""
select
*
from link_edges
""").df()
DB.close()
edges
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
DB = connect()
DB.query("create schema top")
DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, data_dir
from data.main import connect, paths
import os
from pathlib import Path
import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
ids = np.concatenate(embedding_ids)
# save embeddings
save_to = data_dir() / 'embeddings.npy'
save_to = paths('data') / 'embeddings.npy'
np.save(save_to, embeddings)
print(f"embeddings saved: {save_to}")
@ -75,29 +75,28 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
from sklearn.decomposition import PCA
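# project per-publisher averaged embeddings onto two principal components for plotting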
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,s.publisher_id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,s.publisher_id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
results = []
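# collapse each publisher's story embeddings into a single averaged vector (per the command name)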
for publisher_id, group in data.groupby(['publisher_id']):
@ -115,47 +114,45 @@ def create_avg_pca_table():
results['second'] = pred[:, 1]
table_name = "top.publisher_embeddings_pca"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id
,results.first as first
,results.second as second
FROM results
""")
DB.close()
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id
,results.first as first
,results.second as second
FROM results
""")
print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
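# PCA reduces each story embedding to two components for the 2-D scatter plots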
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)
@ -166,42 +163,41 @@ def create_pca_table():
table_name = f"top.story_embeddings_pca"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
DB.close()
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
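# SGDClassifier's default hinge loss yields a linear SVM fit by stochastic gradient descent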
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
x = embeddings[data['index']]
#y = data['ordinal'].to_numpy().reshape(-1, 1)