From 6dba51944311810e4f82ee4c99d196136c4cdd23 Mon Sep 17 00:00:00 2001
From: matt
Date: Sat, 22 Apr 2023 18:19:14 -0700
Subject: [PATCH] better cli command definitions.
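
Each module now defines standalone, namespaced click commands (e.g.
"scrape:download", "bias:parse") instead of a per-module click group, and
cli.py registers them on the main group. The scrape commands are meant to run
in order: scrape:download, scrape:parse, scrape:load, then scrape:normalize.
mine.py is added but its commands stay commented out in cli.py for now.
Assuming cli.py remains the entry point, invocation looks roughly like:

    python src/cli.py scrape:download
    python src/cli.py scrape:parse
    python src/cli.py bias:parse
    python src/cli.py word:embed --chunks 1000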
---
 src/bias.py         | 19 +++------
 src/broken_links.py | 50 ++++++++++++++++-------
 src/cli.py          | 17 ++++++++
 src/mine.py         | 99 +++++++++++++++++++++++++++++++++++++++++++++
 src/scrape.py       | 82 +++++++++++++++++++++++++------------
 src/word.py         | 20 ++++-----
 6 files changed, 222 insertions(+), 65 deletions(-)
 create mode 100644 src/mine.py

diff --git a/src/bias.py b/src/bias.py
index b575b2d..b972d42 100644
--- a/src/bias.py
+++ b/src/bias.py
@@ -6,10 +6,6 @@ from pathlib import Path
 import os
 import csv
 
-@click.group()
-def cli() -> None:
-    ...
-
 def map(rating:str) -> int:
     mapping = {
         'right' : 0,
@@ -22,7 +18,7 @@ def map(rating:str) -> int:
 
     return mapping[rating]
 
-@cli.command()
+@click.command(name="bias:load")
 def load() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -35,8 +31,8 @@ def load() -> None:
             ,b.*
         from read_csv_auto('{f}') b
     """)
-@cli.command()
-def join() -> None:
+@click.command(name="bias:normalize")
+def normalize() -> None:
     DB = connect()
 
     DB.sql("""
@@ -101,7 +97,7 @@ def join() -> None:
         where publisher ilike '%CNN%'
     """)
 
-@cli.command()
+@click.command(name='bias:debug')
 def debug() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -145,8 +141,8 @@ def debug() -> None:
 
     outlets
 
-@cli.command()
-def parse_html() -> None:
+@click.command(name='bias:parse')
+def parse() -> None:
     """parse the saved html page of allslides.com bias ratings into a normalized csv file"""
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -175,6 +171,3 @@ def parse_html() -> None:
         ratings.append(rating)
     df = pd.DataFrame(ratings)
     df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
-
-if __name__ == "__main__":
-    cli()
diff --git a/src/broken_links.py b/src/broken_links.py
index 110e691..aa650d7 100644
--- a/src/broken_links.py
+++ b/src/broken_links.py
@@ -1,21 +1,43 @@
 import requests
-import click
-from data import connect
 import seaborn as sns
 import matplotlib.pyplot as plt
+import click
+import pandas as pd
 
-DB = connect()
+from data import connect
 
-DB.sql("""
-    select
-        id
-        ,url
-    from stories
-""")
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects."""
+    DB = connect()
 
-DB.sql("""
-    describe stories
-""")
+    urls = DB.query("""
+        select
+            id
+            ,url
+        from stories
+        order by published_at asc
+        limit 5
+    """).fetchall()
 
-sns.histplot(x=hist['cnt'])
-plt.show()
+    DB.close()
+
+    responses = []
+    for story_id, url in urls:
+        out = {'story_id' : story_id, 'final_url' : url, 'redirect' : 0, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
+        try:
+            response = requests.get(url, verify=False, timeout=10)
+            if response.history:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        except requests.exceptions.ReadTimeout:
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        responses.append(out)
+
+    results = pd.DataFrame(responses)
+    sns.histplot(data=results, x='status_code')
+    plt.show()
diff --git a/src/cli.py b/src/cli.py
index 3e1464d..4391de0 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
     plt.savefig(output)
 
 if __name__ == "__main__":
+    import scrape
+    cli.add_command(scrape.download)
+    cli.add_command(scrape.parse)
+    cli.add_command(scrape.load)
+    cli.add_command(scrape.normalize)
+    import word
+    # cli.add_command(word.distance)
+    # cli.add_command(word.train)
+    cli.add_command(word.embed)
+    cli.add_command(word.max_sequence)
+    import bias
+    cli.add_command(bias.parse)
+    cli.add_command(bias.load)
+    cli.add_command(bias.normalize)
+    # import mine
+    # cli.add_command(mine.embeddings)
+    # cli.add_command(mine.cluster)
     cli()
diff --git a/src/mine.py b/src/mine.py
new file mode 100644
index 0000000..3bc74dd
--- /dev/null
+++ b/src/mine.py
@@ -0,0 +1,99 @@
+from data import data_dir, connect
+import click
+import numpy as np
+import sklearn
+from sklearn.cluster import MiniBatchKMeans
+
+
+@click.command(name="mine:embeddings")
+def embeddings():
+    """cluster the story embeddings with mini-batch k-means and store the assignments."""
+    data = np.load(data_dir() / "embeddings.npy")
+    kmeans = MiniBatchKMeans(n_clusters=5,
+                             random_state=0,
+                             batch_size=6,
+                             n_init="auto")
+    model = kmeans.fit(data)
+    clusters = model.predict(data)
+
+    db = connect()
+
+    # must match the ordering used when the embeddings were generated (order by id desc)
+    stories = db.sql("""
+        select
+            id
+        from stories
+        order by id desc
+    """).df()
+    stories['cluster'] = clusters
+
+    db.execute("drop table if exists clusters")
+    db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
+    db.append("clusters", stories)
+
+@click.command(name="mine:cluster")
+def cluster():
+    """summarize cluster assignments by publisher."""
+    import pandas as pd
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+
+    db = connect()
+    df = db.sql("""
+        select
+            s.publisher
+            ,c.cluster
+            ,count(1) as total
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        group by
+            s.publisher, c.cluster
+    """).df()
+
+    pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
+    print(pivoted[:25])
+
+    db.sql("""
+        select
+            publisher
+            ,title
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        where c.cluster = 0
+    """)
+
+def main():
+    db = connect()
+    db.sql("""
+        select
+            count(distinct publisher)
+        from stories
+    """)
+
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+        select
+            max(length(title)) as max
+            ,min(length(title)) as min
+            ,avg(length(title)) as avg
+            ,sum(length(title)) as characters
+        from cte
+    """).fetchall()
+
+    # let's calculate the size of the word embeddings stored as a list in the database
+    # (incomplete exploratory query kept as a comment)
+    # with cte as (
+    #     select
+    #         distinct title
+    #     from stories
+    # )
+
+    db.sql("""
+        select
+            count(distinct url)
+        from stories
+    """)
diff --git a/src/scrape.py b/src/scrape.py
index 8950b25..66e96d0 100644
--- a/src/scrape.py
+++ b/src/scrape.py
@@ -8,36 +8,33 @@ from data import data_dir, connect
 from lxml import etree
 import pandas as pd
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
-@click.option('--directory', type=Path, default=data_dir())
-@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
 def load(directory, database):
     stories = directory / "stories.csv"
     related = directory / "related.csv"
     db = connect()
     db.sql(f"""
-        CREATE TABLE stories AS
+        CREATE OR REPLACE TABLE stories AS
         SELECT
             *
         FROM read_csv_auto('{stories}')
     """)
     db.sql(f"""
-        CREATE TABLE related_stories AS
+        CREATE OR REPLACE TABLE related_stories AS
         SELECT
             *
         FROM read_csv_auto('{related}')
     """)
     db.close()
 
-@cli.command()
-@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
 def download(output_dir):
+    """download every day from 2005-10-01 to today from memeorandum.com"""
     day = timedelta(days=1)
     cur = date(2005, 10, 1)
     end = date.today()
@@ -58,7 +55,7 @@ def download(output_dir):
             f.write(r.text)
 
-@cli.command()
+@click.command(name='scrape:parse')
 @click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
 @click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
@@ -83,6 +80,7 @@ def parse(directory, output_dir):
     # item = items[0]
     for item in items:
         out = dict()
+        out['published_at'] = date
         citation = item.xpath('./cite')
         if not citation:
             continue
@@ -122,23 +120,55 @@ def parse(directory, output_dir):
             another['parent_id'] = item_id
             others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
 
-@cli.command()
+@click.command(name='scrape:normalize')
 def normalize():
+    """fix database after load. remove duplicates. create publishers."""
     DB = connect()
     DB.sql("""
-    create table publishers as
-        select
-            row_number() over(order by publisher) as id
-            ,publisher
-            ,publisher_url
-        from stories
-        group by publisher, publisher_url
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+                OR title_ctn > 1
+        )
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher
+                ,s.publisher_url
+            FROM stories s
+            GROUP BY
+                s.publisher
+                ,s.publisher_url
+        ), together AS (
+            SELECT
+                COALESCE(cte.publisher, r.publisher) AS publisher
+                ,cte.publisher_url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+                ON cte.publisher = r.publisher
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.*
+        FROM together t
+        GROUP BY
+            publisher
+            ,publisher_url
     """)
-
-
-if __name__ == "__main__":
-    cli()
diff --git a/src/word.py b/src/word.py
index b78e8ce..490e8db 100644
--- a/src/word.py
+++ b/src/word.py
@@ -6,12 +6,9 @@ from tqdm import tqdm
 import torch
 from pathlib import Path
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
+@click.command(name="word:max-sequence")
 def max_sequence():
+    """calculate the maximum token length given the story titles"""
     db = connect()
     longest = db.sql("""
         select
@@ -20,16 +17,19 @@ def max_sequence():
         order by length(title) desc
         limit 5000
     """).df()
+    db.close()
 
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     tokens = tokenizer(longest['title'].to_list())
     print(f"{max([len(x) for x in tokens['input_ids']])}")
 
-@cli.command()
+@click.command(name="word:train")
 def train():
+    """TODO"""
     table = from_db(Data.Titles)
     n_classes = 10
 
+@click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
 @click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
 @click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
@@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
         from stories
         order by id desc
     """).df()
+    db.close()
 
     # normalize text
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-            #outputs = outputs.to(torch.device('cpu'))
         return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
     tokens = []
@@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
     np.save(embedding_dest, embeddings)
     np.save(token_dest, tokens)
 
-@cli.command()
+@click.command(name="word:distance")
 def distance():
     """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     print(np.min(distances))
     min_index = (np.argmin(distances))
     closest = np.unravel_index(min_index, distances.shape)
     distances.flatten().shape
-
-
-if __name__ == "__main__":
-    cli()