better cli command definitions: drop the per-module @click.group() wrappers in favor of namespaced @click.command(name="module:verb") definitions, registered centrally in src/cli.py.
Parent 086d858c3b · Commit 6dba519443
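In outline, the pattern this commit moves to looks like the following minimal sketch (command and function names are taken from the diff below; the bodies are elided). Each module defines free-standing commands with namespaced names, and the single group in src/cli.py adopts them with add_command:

    import click

    @click.group()
    def cli() -> None:
        ...

    # module-level command, named with a "module:verb" prefix
    @click.command(name="bias:load")
    def load() -> None:
        ...

    # what src/cli.py now does for each module's commands
    cli.add_command(load)

    if __name__ == "__main__":
        cli()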
src/bias.py (19 changed lines)
@@ -6,10 +6,6 @@ from pathlib import Path
 import os
 import csv
 
-@click.group()
-def cli() -> None:
-    ...
-
 def map(rating:str) -> int:
     mapping = {
         'right' : 0,
@@ -22,7 +18,7 @@ def map(rating:str) -> int:
     return mapping[rating]
 
 
-@cli.command()
+@click.command(name="bias:load")
 def load() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -35,8 +31,8 @@ def load() -> None:
         ,b.*
         from read_csv_auto('{f}') b
     """)
-@cli.command()
-def join() -> None:
+@click.command(name="bias:normalize")
+def normalize() -> None:
     DB = connect()
 
     DB.sql("""
@@ -101,7 +97,7 @@ def join() -> None:
         where publisher ilike '%CNN%'
     """)
 
-@cli.command()
+@click.command(name='bias:debug')
 def debug() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -145,8 +141,8 @@ def debug() -> None:
 
     outlets
 
-@cli.command()
-def parse_html() -> None:
+@click.command(name='bias:parse')
+def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -175,6 +171,3 @@ def parse_html() -> None:
             ratings.append(rating)
     df = pd.DataFrame(ratings)
     df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
-
-if __name__ == "__main__":
-    cli()
src/broken.py

@@ -1,21 +1,43 @@
 import requests
-import click
-from data import connect
 import seaborn as sns
 import matplotlib.pyplot as plt
+import click
 
-DB = connect()
+from data import connect
 
-DB.sql("""
-    select
-        id
-        ,url
-    from stories
-""")
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects."""
+    DB = connect()
 
-DB.sql("""
-    describe stories
-""")
+    urls = DB.query("""
+        select
+            id
+            ,url
+        from stories
+        order by published_at asc
+        limit 5
+    """).fetchall()
 
-sns.histplot(x=hist['cnt'])
-plt.show()
+    DB.close()
+
+    story_id, url = urls[1]
+    # url
+    responses = []
+    for story_id, url in urls:
+        out = {'story_id' : story_id, 'final_url' : url, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
+        try:
+            response = requests.get(url, verify=False, timeout=10)
+            if len(response.history) > 1:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        except requests.exceptions.ReadTimeout as e:
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        responses.append(out)
+
+    sns.histplot(x=hist['cnt'])
+    plt.show()
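A hedged note on the new crawl loop: in requests, response.history holds one Response per redirect hop, so it is non-empty whenever any redirect occurred; the committed len(response.history) > 1 test misses single-hop redirects, and hist in the final histplot call is not defined in this file. A sketch of the per-URL check under those observations (names follow the diff):

    import requests

    def check_url(story_id: int, url: str, timeout: int = 10) -> dict:
        """Fetch one story URL and record redirect/timeout/status details."""
        out = {'story_id': story_id, 'final_url': url, 'timeout': 0,
               'status_code': 200, 'content_length': 0, 'redirect': 0}
        try:
            response = requests.get(url, timeout=timeout)
            if response.history:  # any redirect hop makes history non-empty
                out['redirect'] = 1
                out['final_url'] = response.url
            out['status_code'] = response.status_code
            out['content_length'] = len(response.content)
        except requests.exceptions.ReadTimeout:
            out['timeout'] = 1
        return out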
src/cli.py (17 changed lines)
@@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
     plt.savefig(output)
 
 if __name__ == "__main__":
+    import scrape
+    cli.add_command(scrape.download)
+    cli.add_command(scrape.parse)
+    cli.add_command(scrape.load)
+    cli.add_command(scrape.normalize)
+    import word
+    # cli.add_command(word.distance)
+    # cli.add_command(word.train)
+    cli.add_command(word.embed)
+    cli.add_command(word.max_sequence)
+    import bias
+    cli.add_command(bias.parse)
+    cli.add_command(bias.load)
+    cli.add_command(bias.normalize)
+    # import mine
+    # cli.add_command(mine.embeddings)
+    # cli.add_command(mine.cluster)
     cli()
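With those registrations in place, every command is reachable through the one entry point, e.g. `python src/cli.py bias:load` or `python src/cli.py scrape:parse` (invocation path assumed; click accepts colons in command names). Running `python src/cli.py --help` lists all registered commands, and the `module:` prefixes keep them visually grouped.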
src/mine.py (new file)

@@ -0,0 +1,99 @@
+from data import data_dir, connect
+import numpy as np
+import sklearn
+from sklearn.cluster import MiniBatchKMeans
+
+
+@click.command(name="mine:embeddings")
+def embeddings():
+    data = np.load(data_dir() / "embeddings.npy")
+    kmeans = MiniBatchKMeans(n_clusters=5,
+                             random_state=0,
+                             batch_size=6,
+                             n_init="auto")
+    model = kmeans.fit(data)
+    clusters = model.predict(data)
+
+    db = connect()
+
+    stories = db.sql("""
+        select
+            id
+        from stories
+        order by id desc
+    """).df()
+    stories['cluster'] = clusters
+
+    db.execute("drop table clusters")
+    db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
+    db.append("clusters", stories)
+
+@click.command(name="mine:cluster")
+def cluster():
+
+    import pandas as pd
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+    df = db.sql("""
+        select
+            s.publisher
+            ,c.cluster
+            ,count(1) as total
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        group by
+            s.publisher, c.cluster
+    """).df()
+
+    pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
+    pivoted[:25]
+
+    db.sql("""
+        select
+            publisher
+            ,title
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        where c.cluster = 0
+    """)
+
+    len(stories)
+    data.shape
+
+def main():
+    db.sql("""
+        select
+            count(distinct publisher)
+        from stories
+    """)
+
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+        select
+            max(length(title)) as max
+            ,min(length(title)) as min
+            ,avg(length(title)) as avg
+            ,sum(length(title)) as characters
+        from cte
+    """).fetchall()
+    """
+    let's calculate the size of the word embeddings stored as a list in the database
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+
+    db.sql("""
+        select
+            count(distinct url)
+        from stories
+    """)
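As committed, this module is still exploratory: it never imports click, cluster() uses db without connecting, there are REPL-style bare expressions (pivoted[:25], len(stories), data.shape), and main() mixes live SQL with what looks like an unterminated triple-quoted note, so it likely doesn't import cleanly; that would explain why its registration stays commented out in src/cli.py. For reference, a self-contained sketch of the mine:embeddings step (duckdb and sklearn APIs as documented; paths and table names follow the diff; CREATE OR REPLACE is a swapped-in guard so the drop cannot fail on a missing table):

    import numpy as np
    import duckdb
    from sklearn.cluster import MiniBatchKMeans

    data = np.load("embeddings.npy")  # shape (n_stories, n_features)
    clusters = MiniBatchKMeans(n_clusters=5, random_state=0,
                               batch_size=6, n_init="auto").fit_predict(data)

    db = duckdb.connect("stories.duckdb")
    stories = db.sql("select id from stories order by id desc").df()
    stories['cluster'] = clusters  # row order must match the embeddings

    db.execute("CREATE OR REPLACE TABLE clusters (story_id BIGINT, cluster INTEGER)")
    db.append("clusters", stories)  # DataFrame columns map positionally
    db.close()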
src/scrape.py

@@ -8,36 +8,33 @@ from data import data_dir, connect
 from lxml import etree
 import pandas as pd
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
-@click.option('--directory', type=Path, default=data_dir())
-@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
 def load(directory, database):
     stories = directory / "stories.csv"
     related = directory / "related.csv"
     db = connect()
 
     db.sql(f"""
-        CREATE TABLE stories AS
+        CREATE OR REPLACE TABLE stories AS
         SELECT
             *
         FROM read_csv_auto('{stories}')
     """)
 
     db.sql(f"""
-        CREATE TABLE related_stories AS
+        CREATE OR REPLACE TABLE related_stories AS
         SELECT
             *
         FROM read_csv_auto('{related}')
     """)
     db.close()
 
-@cli.command()
-@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
 def download(output_dir):
+    """download every day from 01/10/2005 to today from memeorandum.com"""
     day = timedelta(days=1)
     cur = date(2005, 10, 1)
     end = date.today()
@@ -58,7 +55,7 @@ def download(output_dir):
             f.write(r.text)
 
 
-@cli.command()
+@click.command(name='scrape:parse')
 @click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
 @click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
@@ -83,6 +80,7 @@ def parse(directory, output_dir):
     # item = items[0]
     for item in items:
         out = dict()
+        out['published_at'] = date
         citation = item.xpath('./cite')
         if not citation:
             continue
@@ -122,23 +120,55 @@ def parse(directory, output_dir):
             another['parent_id'] = item_id
             others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
 
-@cli.command()
+@click.command(name='scrape:normalize')
 def normalize():
+    """fix database after load. remove duplicates. create publishers."""
     DB = connect()
     DB.sql("""
-        create table publishers as
-        select
-            row_number() over(order by publisher) as id
-            ,publisher
-            ,publisher_url
-        from stories
-        group by publisher, publisher_url
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+            OR title_ctn > 1
+        )
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher
+                ,s.publisher_url
+            FROM stories s
+            GROUP BY
+                s.publisher
+                ,s.publisher_url
+        ), together AS (
+            SELECT
+                COALESCE(cte.publisher, r.publisher) AS publisher
+                ,cte.publisher_url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+                ON cte.publisher = r.publisher
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.*
+        FROM together t
+        GROUP BY
+            publisher
+            ,publisher_url
     """)
 
 
-if __name__ == "__main__":
-    cli()
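One hedged observation on the new dedup query: the ROW_NUMBER() OVER(PARTITION BY url) and OVER(PARTITION BY title) windows carry no ORDER BY, so which duplicate row survives is unspecified. If the kept row matters, adding an ordering inside the window, e.g. OVER(PARTITION BY url ORDER BY id) (standard window-function syntax, assuming id is stable), would pin it deterministically.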
src/word.py (20 changed lines)
@@ -6,12 +6,9 @@ from tqdm import tqdm
 import torch
 from pathlib import Path
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
+@click.command(name="word:max-sequence")
 def max_sequence():
+    """calculate the maximum token length given the story titles"""
     db = connect()
     longest = db.sql("""
         select
@@ -20,16 +17,19 @@ def max_sequence():
         order by length(title) desc
         limit 5000
     """).df()
+    db.close()
 
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     tokens = tokenizer(longest['title'].to_list())
     print(f"{max([len(x) for x in tokens['input_ids']])}")
 
-@cli.command()
+@click.command(name="word:train")
 def train():
+    """TODO"""
     table = from_db(Data.Titles)
     n_classes = 10
 
+@click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
 @click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
 @click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
@@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
         from stories
         order by id desc
     """).df()
+    db.close()
 
     # normalize text
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-            #outputs = outputs.to(torch.device('cpu'))
         return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
     tokens = []
@@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
     np.save(embedding_dest, embeddings)
     np.save(token_dest, tokens)
 
-@cli.command()
+@click.command(name="word:distance")
 def distance():
     """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
@@ -88,7 +88,3 @@ def distance():
     min_index = (np.argmin(distances))
     closest = np.unravel_index(min_index, distances.shape)
     distances.flatten().shape
-
-
-if __name__ == "__main__":
-    cli()
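For context, the token-length check that word:max-sequence performs reduces to the following sketch (transformers API as documented; "roberta-base" comes from the diff, the titles are placeholders). Without padding, each input_ids entry keeps its own length, so the max over them is the longest tokenized title:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    titles = ["a short headline", "a much longer headline about something"]
    tokens = tokenizer(titles)  # no padding: per-title token lists
    print(max(len(ids) for ids in tokens["input_ids"]))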