better cli command definitions.

matt 2023-04-22 18:19:14 -07:00
parent 086d858c3b
commit 6dba519443
6 changed files with 222 additions and 65 deletions
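The pattern this commit moves to: each module defines standalone commands with namespaced names (bias:load, scrape:parse, word:embed, and so on) instead of hanging them off a per-module @click.group(), and a single entry-point group collects them with cli.add_command. A minimal sketch of that shape, with illustrative names only (the entry-point file name is not part of this commit):

    import click

    @click.command(name="bias:load")
    def load() -> None:
        """Illustrative body; the real command loads bias CSVs into duckdb."""
        ...

    # one top-level group owns every registered command
    cli = click.Group()
    cli.add_command(load)   # exposed under its namespaced name, "bias:load"

    if __name__ == "__main__":
        cli()   # e.g. `python <entrypoint>.py bias:load`

Because the name= argument carries the namespace, the functions themselves keep short names (load, parse, normalize) without colliding across modules.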

View File

@@ -6,10 +6,6 @@ from pathlib import Path
 import os
 import csv
-@click.group()
-def cli() -> None:
-    ...
 def map(rating:str) -> int:
     mapping = {
         'right' : 0,
@@ -22,7 +18,7 @@ def map(rating:str) -> int:
     return mapping[rating]
-@cli.command()
+@click.command(name="bias:load")
 def load() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -35,8 +31,8 @@ def load() -> None:
         ,b.*
         from read_csv_auto('{f}') b
     """)
-@cli.command()
-def join() -> None:
+@click.command(name="bias:normalize")
+def normalize() -> None:
     DB = connect()
     DB.sql("""
@@ -101,7 +97,7 @@ def join() -> None:
         where publisher ilike '%CNN%'
     """)
-@cli.command()
+@click.command(name='bias:debug')
 def debug() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -145,8 +141,8 @@ def debug() -> None:
     outlets
-@cli.command()
-def parse_html() -> None:
+@click.command(name='bias:parse')
+def parse() -> None:
     """parse the save html page of allslides.com bias ratings into a normalized csv file"""
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -175,6 +171,3 @@ def parse_html() -> None:
         ratings.append(rating)
     df = pd.DataFrame(ratings)
     df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
-if __name__ == "__main__":
-    cli()

View File

@@ -1,21 +1,43 @@
 import requests
-import click
-from data import connect
 import seaborn as sns
 import matplotlib.pyplot as plt
+import click
+from data import connect
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects."""
     DB = connect()
-DB.sql("""
+    urls = DB.query("""
     select
         id
         ,url
     from stories
-""")
-DB.sql("""
-    describe stories
-""")
+    order by published_at asc
+    limit 5
+    """).fetchall()
+    DB.close()
+    story_id, url = urls[1]
+    # url
+    responses = []
+    for story_id, url in urls:
+        out = {'story_id' : story_id, 'final_url' : url, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
+        try:
+            response = requests.get(url, verify=False, timeout=10)
+            if len(response.history) > 1:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        except requests.exceptions.ReadTimeout as e:
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        responses.append(out)
 sns.histplot(x=hist['cnt'])
 plt.show()

View File

@@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
     plt.savefig(output)
 if __name__ == "__main__":
+    import scrape
+    cli.add_command(scrape.download)
+    cli.add_command(scrape.parse)
+    cli.add_command(scrape.load)
+    cli.add_command(scrape.normalize)
+    import word
+    # cli.add_command(word.distance)
+    # cli.add_command(word.train)
+    cli.add_command(word.embed)
+    cli.add_command(word.max_sequence)
+    import bias
+    cli.add_command(bias.parse)
+    cli.add_command(bias.load)
+    cli.add_command(bias.normalize)
+    # import mine
+    # cli.add_command(mine.embeddings)
+    # cli.add_command(mine.cluster)
     cli()
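One payoff of defining the commands as standalone click.Command objects rather than methods of a module-local group is that each one can be exercised on its own, for example through click's test runner. A hedged sketch, assuming the bias module from this commit is importable as-is:

    from click.testing import CliRunner
    import bias  # module changed in this commit; importability is assumed

    runner = CliRunner()
    result = runner.invoke(bias.load, [])   # runs the command registered as "bias:load"
    print(result.exit_code)                 # 0 on success
    print(result.output)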

src/mine.py (new file, +99 lines)
View File

@@ -0,0 +1,99 @@
+from data import data_dir, connect
+import numpy as np
+import sklearn
+from sklearn.cluster import MiniBatchKMeans
+import click
+@click.command(name="mine:embeddings")
+def embeddings():
+    data = np.load(data_dir() / "embeddings.npy")
+    kmeans = MiniBatchKMeans(n_clusters=5,
+                             random_state=0,
+                             batch_size=6,
+                             n_init="auto")
+    model = kmeans.fit(data)
+    clusters = model.predict(data)
+    db = connect()
+    stories = db.sql("""
+        select
+            id
+        from stories
+        order by id desc
+    """).df()
+    stories['cluster'] = clusters
+    db.execute("drop table clusters")
+    db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
+    db.append("clusters", stories)
+@click.command(name="mine:cluster")
+def cluster():
+    import pandas as pd
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+    db = connect()
+    df = db.sql("""
+        select
+            s.publisher
+            ,c.cluster
+            ,count(1) as total
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        group by
+            s.publisher, c.cluster
+    """).df()
+    pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
+    pivoted[:25]
+    db.sql("""
+        select
+            publisher
+            ,title
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        where c.cluster = 0
+    """)
+    len(stories)
+    data.shape
+def main():
+    db.sql("""
+        select
+            count(distinct publisher)
+        from stories
+    """)
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+        select
+            max(length(title)) as max
+            ,min(length(title)) as min
+            ,avg(length(title)) as avg
+            ,sum(length(title)) as characters
+        from cte
+    """).fetchall()
+    """
+    let's calculate the size of the word embeddings stored as a list in the database
+    """
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+    db.sql("""
+        select
+            count(distinct url)
+        from stories
+    """)

View File

@@ -8,36 +8,33 @@ from data import data_dir, connect
 from lxml import etree
 import pandas as pd
-@click.group()
-def cli():
-    ...
-@cli.command()
-@click.option('--directory', type=Path, default=data_dir())
-@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
 def load(directory, database):
     stories = directory / "stories.csv"
     related = directory / "related.csv"
     db = connect()
     db.sql(f"""
-        CREATE TABLE stories AS
+        CREATE OR REPLACE TABLE stories AS
         SELECT
             *
         FROM read_csv_auto('{stories}')
     """)
     db.sql(f"""
-        CREATE TABLE related_stories AS
+        CREATE OR REPLACE TABLE related_stories AS
         SELECT
             *
         FROM read_csv_auto('{related}')
     """)
     db.close()
-@cli.command()
-@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
 def download(output_dir):
+    """download every day from 01/10/2005 to today from memeorandum.com"""
     day = timedelta(days=1)
     cur = date(2005, 10, 1)
     end = date.today()
@@ -58,7 +55,7 @@ def download(output_dir):
         f.write(r.text)
-@cli.command()
+@click.command(name='scrape:parse')
 @click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
 @click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
@@ -83,6 +80,7 @@ def parse(directory, output_dir):
     # item = items[0]
     for item in items:
         out = dict()
+        out['published_at'] = date
         citation = item.xpath('./cite')
         if not citation:
             continue
@@ -122,23 +120,55 @@ def parse(directory, output_dir):
         another['parent_id'] = item_id
         others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
-@cli.command()
+    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
+@click.command(name='scrape:normalize')
 def normalize():
+    """fix database after load. remove duplicates. create publishers."""
     DB = connect()
     DB.sql("""
-        create table publishers as
-        select
-            row_number() over(order by publisher) as id
-            ,publisher
-            ,publisher_url
-        from stories
-        group by publisher, publisher_url
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+            OR title_ctn > 1
+        )
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher
+                ,s.publisher_url
+            FROM stories s
+            GROUP BY
+                s.publisher
+                ,s.publisher_url
+        ), together AS (
+            SELECT
+                COALESCE(cte.publisher, r.publisher) AS publisher
+                ,cte.publisher_url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+                ON cte.publisher = r.publisher
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.*
+        FROM together t
+        GROUP BY
+            publisher
+            ,publisher_url
     """)
-if __name__ == "__main__":
-    cli()

View File

@@ -6,12 +6,9 @@ from tqdm import tqdm
 import torch
 from pathlib import Path
-@click.group()
-def cli():
-    ...
-@cli.command()
+@click.command(name="word:max-sequence")
 def max_sequence():
+    """calculate the maximum token length given the story titles"""
     db = connect()
     longest = db.sql("""
         select
@@ -20,16 +17,19 @@ def max_sequence():
         order by length(title) desc
         limit 5000
     """).df()
+    db.close()
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     tokens = tokenizer(longest['title'].to_list())
     print(f"{max([len(x) for x in tokens['input_ids']])}")
-@cli.command()
+@click.command(name="word:train")
 def train():
+    """TODO"""
     table = from_db(Data.Titles)
     n_classes = 10
+@click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
 @click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
 @click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
@@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
         from stories
         order by id desc
     """).df()
+    db.close()
     # normalize text
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-        #outputs = outputs.to(torch.device('cpu'))
         return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
     tokens = []
@@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
     np.save(embedding_dest, embeddings)
     np.save(token_dest, tokens)
-@cli.command()
+@click.command(name="word:distance")
 def distance():
     """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
@@ -88,7 +88,3 @@ def distance():
     min_index = (np.argmin(distances))
     closest = np.unravel_index(min_index, distances.shape)
     distances.flatten().shape
-if __name__ == "__main__":
-    cli()