better cli command definitions.

This commit is contained in:
matt 2023-04-22 18:19:14 -07:00
parent 086d858c3b
commit 6dba519443
6 changed files with 222 additions and 65 deletions

View File

@ -6,10 +6,6 @@ from pathlib import Path
import os
import csv
@click.group()
def cli() -> None:
...
def map(rating:str) -> int:
mapping = {
'right' : 0,
@ -22,7 +18,7 @@ def map(rating:str) -> int:
return mapping[rating]
@cli.command()
@click.command(name="bias:load")
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@ -35,8 +31,8 @@ def load() -> None:
,b.*
from read_csv_auto('{f}') b
""")
@cli.command()
def join() -> None:
@click.command(name="bias:normalize")
def normalize() -> None:
DB = connect()
DB.sql("""
@ -101,7 +97,7 @@ def join() -> None:
where publisher ilike '%CNN%'
""")
@cli.command()
@click.command(name='bias:debug')
def debug() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@ -145,8 +141,8 @@ def debug() -> None:
outlets
@cli.command()
def parse_html() -> None:
@click.command(name='bias:parse')
def parse() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@ -175,6 +171,3 @@ def parse_html() -> None:
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
if __name__ == "__main__":
cli()

View File

@ -1,21 +1,43 @@
import requests
import click
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
import click
from data import connect
@click.command(name="broken:crawl")
def crawl():
"""crawl story urls checking for link rot or redirects."""
DB = connect()
DB.sql("""
urls = DB.query("""
select
id
,url
from stories
""")
order by published_at asc
limit 5
""").fetchall()
DB.sql("""
describe stories
""")
DB.close()
story_id, url = urls[1]
# url
responses = []
for story_id, url in urls:
out = {'story_id' : story_id, 'final_url' : url, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
try:
response = requests.get(url, verify=False, timeout=10)
if len(response.history) > 1:
out['redirect'] = 1
if url != response.url:
out['final_url'] = response.url
out['status_code'] = response.status_code
out['content_length'] = len(response.content)
except requests.exceptions.ReadTimeout as e:
print(f"timeout: {url}")
out['timeout'] = 1
responses.append(out)
sns.histplot(x=hist['cnt'])
plt.show()

View File

@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
plt.savefig(output)
if __name__ == "__main__":
    # The command modules are only needed when this file is run as a script,
    # so they are imported here rather than at module import time.
    import scrape
    import word
    import bias

    # Currently disabled (not registered): word.distance, word.train,
    # and the `mine` module's mine.embeddings / mine.cluster commands.
    enabled_commands = (
        scrape.download,
        scrape.parse,
        scrape.load,
        scrape.normalize,
        word.embed,
        word.max_sequence,
        bias.parse,
        bias.load,
        bias.normalize,
    )
    for command in enabled_commands:
        cli.add_command(command)

    cli()

99
src/mine.py Normal file
View File

@ -0,0 +1,99 @@
from data import data_dir, connect
import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans
# NOTE(review): `click` is not among the imports visible at the top of this
# file -- confirm it is imported before this decorator runs.
@click.command(name="mine:embeddings")
def embeddings():
    """Cluster the saved story embeddings and store the labels in the database.

    Loads ``embeddings.npy`` from the data directory, fits a 5-cluster
    MiniBatchKMeans model on it, and rebuilds the ``clusters`` table with one
    (story_id, cluster) row per story.
    """
    data = np.load(data_dir() / "embeddings.npy")
    kmeans = MiniBatchKMeans(n_clusters=5,
                             random_state=0,
                             batch_size=6,
                             n_init="auto")
    model = kmeans.fit(data)
    clusters = model.predict(data)

    db = connect()
    try:
        stories = db.sql("""
            select
                id
            from stories
            order by id desc
        """).df()
        # Labels are attached positionally: this assumes row i of the
        # embeddings file belongs to the i-th story when ordered by id
        # descending -- TODO confirm against whatever wrote embeddings.npy.
        stories['cluster'] = clusters

        # IF EXISTS so the very first run (no clusters table yet) does not fail.
        db.execute("drop table if exists clusters")
        db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
        db.append("clusters", stories)
    finally:
        # Release the database handle even if a query above raises.
        db.close()
# Summarise cluster membership per publisher: counts stories per
# (publisher, cluster) by joining `clusters` to `stories`, then pivots the
# counts to one row per publisher and one column per cluster.
# NOTE(review): `db` is never assigned in this function as shown -- confirm
# where the connection is supposed to come from.
@click.command(name="mine:cluster")
def cluster():
import pandas as pd
# widen pandas' display limits (rows / columns / line width)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
df = db.sql("""
select
s.publisher
,c.cluster
,count(1) as total
from clusters c
join stories s
on s.id = c.story_id
group by
s.publisher, c.cluster
""").df()
# rows sorted by the column labelled 0 (the cluster-0 count), largest first
pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
# bare expression: the first 25 rows are sliced but neither printed nor stored
pivoted[:25]
# publisher and title of every story assigned to cluster 0; the result is discarded
db.sql("""
select
publisher
,title
from clusters c
join stories s
on s.id = c.story_id
where c.cluster = 0
""")
# NOTE(review): `stories` and `data` are not defined in this function as shown;
# these look like leftover interactive checks -- confirm before relying on them.
len(stories)
data.shape
# Ad-hoc queries over the `stories` table: distinct publisher count,
# min/max/avg/total length of distinct titles, and distinct url count.
# NOTE(review): `db` is not assigned in this function as shown -- confirm
# where the connection is supposed to come from.
# NOTE(review): the bare triple-quote line after the title-length query opens
# a string that is closed by the next `db.sql(` line, which leaves the
# following `with cte as (...)` lines outside any string literal. As shown
# this does not parse -- confirm against the real file.
def main():
db.sql("""
select
count(distinct publisher)
from stories
""")
db.sql("""
with cte as (
select
distinct title
from stories
)
select
max(length(title)) as max
,min(length(title)) as min
,avg(length(title)) as avg
,sum(length(title)) as characters
from cte
""").fetchall()
"""
let's calculate the size of the word embeddings stored as a list in the database
db.sql("""
with cte as (
select
distinct title
from stories
)
db.sql("""
select
count(distinct url)
from stories
""")

View File

@ -8,36 +8,33 @@ from data import data_dir, connect
from lxml import etree
import pandas as pd
@click.group()
def cli():
...
@cli.command()
@click.option('--directory', type=Path, default=data_dir())
@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
def load(directory, database):
stories = directory / "stories.csv"
related = directory / "related.csv"
db = connect()
db.sql(f"""
CREATE TABLE stories AS
CREATE OR REPLACE TABLE stories AS
SELECT
*
FROM read_csv_auto('{stories}')
""")
db.sql(f"""
CREATE TABLE related_stories AS
CREATE OR REPLACE TABLE related_stories AS
SELECT
*
FROM read_csv_auto('{related}')
""")
db.close()
@cli.command()
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
@click.command(name='scrape:download')
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
def download(output_dir):
"""download every day from 01/10/2005 to today from memeorandum.com"""
day = timedelta(days=1)
cur = date(2005, 10, 1)
end = date.today()
@ -58,7 +55,7 @@ def download(output_dir):
f.write(r.text)
@cli.command()
@click.command(name='scrape:parse')
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
@ -83,6 +80,7 @@ def parse(directory, output_dir):
# item = items[0]
for item in items:
out = dict()
out['published_at'] = date
citation = item.xpath('./cite')
if not citation:
continue
@ -122,23 +120,55 @@ def parse(directory, output_dir):
another['parent_id'] = item_id
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
df = pd.DataFrame(others)
df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
df.to_csv(output_dir / 'related.csv', sep='|', index=False)
@cli.command()
@click.command(name='scrape:normalize')
def normalize():
"""fix database after load. remove duplicates. create publishers."""
DB = connect()
DB.sql("""
create table publishers as
select
row_number() over(order by publisher) as id
,publisher
,publisher_url
from stories
group by publisher, publisher_url
DELETE FROM stories
WHERE id IN (
WITH cte AS (
SELECT
url
,id
,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
FROM stories
)
SELECT
id
FROM cte
WHERE url_ctn > 1
OR title_ctn > 1
)
""")
DB.sql("""
CREATE OR REPLACE TABLE publishers AS
with cte as (
SELECT
s.publisher
,s.publisher_url
FROM stories s
GROUP BY
s.publisher
,s.publisher_url
), together AS (
SELECT
COALESCE(cte.publisher, r.publisher) AS publisher
,cte.publisher_url
FROM cte
FULL OUTER JOIN related_stories r
ON cte.publisher = r.publisher
)
SELECT
ROW_NUMBER() OVER() as id
,t.*
FROM together t
GROUP BY
publisher
,publisher_url
""")
if __name__ == "__main__":
cli()

View File

@ -6,12 +6,9 @@ from tqdm import tqdm
import torch
from pathlib import Path
@click.group()
def cli():
...
@cli.command()
@click.command(name="word:max-sequence")
def max_sequence():
"""calculate the maximum token length given the story titles"""
db = connect()
longest = db.sql("""
select
@ -20,16 +17,19 @@ def max_sequence():
order by length(title) desc
limit 5000
""").df()
db.close()
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokens = tokenizer(longest['title'].to_list())
print(f"{max([len(x) for x in tokens['input_ids']])}")
@cli.command()
@click.command(name="word:train")
def train():
"""TODO"""
table = from_db(Data.Titles)
n_classes = 10
@click.command(name="word:embed")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
from stories
order by id desc
""").df()
db.close()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
tokens = tokens.to(device)
with torch.no_grad():
outputs = model(**tokens)
#outputs = outputs.to(torch.device('cpu'))
return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
tokens = []
@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
np.save(embedding_dest, embeddings)
np.save(token_dest, tokens)
@cli.command()
@click.command(name="word:distance")
def distance():
"""TODO: measure distance between sequence embeddings"""
distances = distance.cdist(classes, classes, 'euclidean')
@ -88,7 +88,3 @@ def distance():
min_index = (np.argmin(distances))
closest = np.unravel_index(min_index, distances.shape)
distances.flatten().shape
if __name__ == "__main__":
cli()