diff --git a/src/bias.py b/src/bias.py
index b575b2d..b972d42 100644
--- a/src/bias.py
+++ b/src/bias.py
@@ -6,10 +6,6 @@ from pathlib import Path
 import os
 import csv

-@click.group()
-def cli() -> None:
-    ...
-
 def map(rating:str) -> int:
     mapping = {
         'right' : 0,
@@ -22,7 +18,7 @@ def map(rating:str) -> int:
     return mapping[rating]

-@cli.command()
+@click.command(name="bias:load")
 def load() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -35,8 +31,8 @@ def load() -> None:
             ,b.*
         from read_csv_auto('{f}') b
     """)

-@cli.command()
-def join() -> None:
+@click.command(name="bias:normalize")
+def normalize() -> None:
     DB = connect()

     DB.sql("""
@@ -101,7 +97,7 @@ def join() -> None:
         where publisher ilike '%CNN%'
     """)

-@cli.command()
+@click.command(name='bias:debug')
 def debug() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -145,8 +141,8 @@ def debug() -> None:
     outlets

-@cli.command()
-def parse_html() -> None:
+@click.command(name='bias:parse')
+def parse() -> None:
     """parse the save html page of allslides.com bias ratings into a normalized csv file"""
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -175,6 +171,3 @@ def parse_html() -> None:
         ratings.append(rating)
     df = pd.DataFrame(ratings)
     df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
-
-if __name__ == "__main__":
-    cli()
diff --git a/src/broken_links.py b/src/broken_links.py
index 110e691..aa650d7 100644
--- a/src/broken_links.py
+++ b/src/broken_links.py
@@ -1,21 +1,41 @@
 import requests
-import click
-from data import connect
 import seaborn as sns
 import matplotlib.pyplot as plt
+import click

-DB = connect()
+from data import connect

-DB.sql("""
-    select
-        id
-        ,url
-    from stories
-""")
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects."""
+    DB = connect()

-DB.sql("""
-    describe stories
-""")
+    urls = DB.query("""
+        select
+            id
+            ,url
+        from stories
+        order by published_at asc
+        limit 5
+    """).fetchall()

-sns.histplot(x=hist['cnt'])
-plt.show()
+    DB.close()
+
+    responses = []
+    for story_id, url in urls:
+        out = {'story_id' : story_id, 'final_url' : url, 'redirect' : 0, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
+        try:
+            response = requests.get(url, verify=False, timeout=10)
+            if len(response.history) > 0:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        except requests.exceptions.ReadTimeout as e:
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        responses.append(out)
+
+    sns.histplot(x=[r['content_length'] for r in responses])
+    plt.show()
diff --git a/src/cli.py b/src/cli.py
index 3e1464d..4391de0 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
     plt.savefig(output)

 if __name__ == "__main__":
+    import scrape
+    cli.add_command(scrape.download)
+    cli.add_command(scrape.parse)
+    cli.add_command(scrape.load)
+    cli.add_command(scrape.normalize)
+    import word
+    # cli.add_command(word.distance)
+    # cli.add_command(word.train)
+    cli.add_command(word.embed)
+    cli.add_command(word.max_sequence)
+    import bias
+    cli.add_command(bias.parse)
+    cli.add_command(bias.load)
+    cli.add_command(bias.normalize)
+    # import mine
+    # cli.add_command(mine.embeddings)
+    # cli.add_command(mine.cluster)
     cli()
diff --git a/src/mine.py b/src/mine.py
new file mode 100644
index 0000000..3bc74dd
--- /dev/null
+++ b/src/mine.py
@@ -0,0 +1,94 @@
+import click
+from data import data_dir, connect
+import numpy as np
+import sklearn
+from sklearn.cluster import MiniBatchKMeans
+
+
+@click.command(name="mine:embeddings")
+def embeddings():
+    data = np.load(data_dir() / "embeddings.npy")
+    kmeans = MiniBatchKMeans(n_clusters=5,
+                             random_state=0,
+                             batch_size=6,
+                             n_init="auto")
+    model = kmeans.fit(data)
+    clusters = model.predict(data)
+
+    db = connect()
+
+    stories = db.sql("""
+        select
+            id
+        from stories
+        order by id desc
+    """).df()
+    stories['cluster'] = clusters
+
+    db.execute("drop table if exists clusters")
+    db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
+    db.append("clusters", stories)
+
+@click.command(name="mine:cluster")
+def cluster():
+
+    import pandas as pd
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+    db = connect()
+    df = db.sql("""
+        select
+            s.publisher
+            ,c.cluster
+            ,count(1) as total
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        group by
+            s.publisher, c.cluster
+    """).df()
+
+    pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
+    print(pivoted[:25])
+
+    db.sql("""
+        select
+            publisher
+            ,title
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        where c.cluster = 0
+    """)
+
+def main():
+    db = connect()
+
+    db.sql("""
+        select
+            count(distinct publisher)
+        from stories
+    """)
+
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+        select
+            max(length(title)) as max
+            ,min(length(title)) as min
+            ,avg(length(title)) as avg
+            ,sum(length(title)) as characters
+        from cte
+    """).fetchall()
+
+    # TODO: calculate the size of the word embeddings stored as a list in the database
+
+    db.sql("""
+        select
+            count(distinct url)
+        from stories
+    """)
diff --git a/src/scrape.py b/src/scrape.py
index 8950b25..66e96d0 100644
--- a/src/scrape.py
+++ b/src/scrape.py
@@ -8,36 +8,33 @@ from data import data_dir, connect
 from lxml import etree
 import pandas as pd

-@click.group()
-def cli():
-    ...
-
-@cli.command()
-@click.option('--directory', type=Path, default=data_dir())
-@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
 def load(directory, database):
     stories = directory / "stories.csv"
     related = directory / "related.csv"
     db = connect()
     db.sql(f"""
-        CREATE TABLE stories AS
+        CREATE OR REPLACE TABLE stories AS
         SELECT
             *
         FROM read_csv_auto('{stories}')
     """)
     db.sql(f"""
-        CREATE TABLE related_stories AS
+        CREATE OR REPLACE TABLE related_stories AS
         SELECT
             *
         FROM read_csv_auto('{related}')
     """)
     db.close()

-@cli.command()
-@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
 def download(output_dir):
+    """download every day from 2005-10-01 to today from memeorandum.com"""
     day = timedelta(days=1)
     cur = date(2005, 10, 1)
     end = date.today()
@@ -58,7 +55,7 @@ def download(output_dir):
             f.write(r.text)

-@cli.command()
+@click.command(name='scrape:parse')
 @click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
 @click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
@@ -83,6 +80,7 @@ def parse(directory, output_dir):
     # item = items[0]
     for item in items:
         out = dict()
+        out['published_at'] = date
         citation = item.xpath('./cite')
         if not citation:
             continue
@@ -122,23 +120,55 @@ def parse(directory, output_dir):
             another['parent_id'] = item_id
             others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'related.csv', sep='|', index=False)

-@cli.command()
+@click.command(name='scrape:normalize')
 def normalize():
+    """fix database after load. remove duplicates. create publishers."""
     DB = connect()

     DB.sql("""
-        create table publishers as
-        select
-            row_number() over(order by publisher) as id
-            ,publisher
-            ,publisher_url
-        from stories
-        group by publisher, publisher_url
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+            OR title_ctn > 1
+        )
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher
+                ,s.publisher_url
+            FROM stories s
+            GROUP BY
+                s.publisher
+                ,s.publisher_url
+        ), together AS (
+            SELECT
+                COALESCE(cte.publisher, r.publisher) AS publisher
+                ,cte.publisher_url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+                ON cte.publisher = r.publisher
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.*
+        FROM together t
+        GROUP BY
+            publisher
+            ,publisher_url
     """)
-
-
-if __name__ == "__main__":
-    cli()
diff --git a/src/word.py b/src/word.py
index b78e8ce..490e8db 100644
--- a/src/word.py
+++ b/src/word.py
@@ -6,12 +6,9 @@ from tqdm import tqdm
 import torch
 from pathlib import Path

-@click.group()
-def cli():
-    ...
-
-@cli.command()
+@click.command(name="word:max-sequence")
 def max_sequence():
+    """calculate the maximum token length given the story titles"""
     db = connect()
     longest = db.sql("""
         select
             title
         from stories
         order by length(title) desc
         limit 5000
     """).df()
+    db.close()
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     tokens = tokenizer(longest['title'].to_list())
     print(f"{max([len(x) for x in tokens['input_ids']])}")

-@cli.command()
+@click.command(name="word:train")
 def train():
+    """TODO"""
     table = from_db(Data.Titles)
     n_classes = 10

+@click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
 @click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
 @click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
 def embed(chunks, embedding_dest, token_dest):
@@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
         from stories
         order by id desc
     """).df()
+    db.close()

     # normalize text
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-        #outputs = outputs.to(torch.device('cpu'))
         return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))

     tokens = []
@@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
     np.save(embedding_dest, embeddings)
     np.save(token_dest, tokens)

-@cli.command()
+@click.command(name="word:distance")
 def distance():
     """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
@@ -88,7 +88,3 @@ def distance():
     min_index = (np.argmin(distances))
     closest = np.unravel_index(min_index, distances.shape)
     distances.flatten().shape
-
-
-if __name__ == "__main__":
-    cli()
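
Usage sketch (not part of the diff): the snippet below mirrors the command registration added to src/cli.py's `__main__` block so the renamed commands can be exercised in-process with click's test runner. The module names (`scrape`, `bias`, `cli`) and command names come from this diff; the use of `click.testing.CliRunner` and the particular commands invoked are illustrative assumptions only.

    # minimal sketch, assuming src/ is on sys.path and that `cli` in src/cli.py
    # is the click group the __main__ block attaches commands to
    from click.testing import CliRunner

    import bias
    import scrape
    from cli import cli  # the click group defined in src/cli.py

    # cli.py only attaches commands under `if __name__ == "__main__"`,
    # so an importer has to re-attach the ones it needs
    cli.add_command(scrape.normalize)
    cli.add_command(bias.load)

    runner = CliRunner()
    print(runner.invoke(cli, ["scrape:normalize"]).output)
    print(runner.invoke(cli, ["bias:load"]).output)

From a shell the same commands run as `python src/cli.py scrape:normalize` and `python src/cli.py bias:load`, since the diff registers every command before invoking `cli()`.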