better cli command definitions: drop the per-module @click.group() wrappers in favor of namespaced @click.command(name="module:verb") definitions, registered centrally in src/cli.py.
Parent 086d858c3b · Commit 6dba519443
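In outline, the pattern this commit moves to looks like the following minimal sketch (command and function names are taken from the diff below; the bodies are elided). Each module defines free-standing commands with namespaced names, and the single group in src/cli.py adopts them with add_command:

    import click

    @click.group()
    def cli() -> None:
        ...

    # module-level command, named with a "module:verb" prefix
    @click.command(name="bias:load")
    def load() -> None:
        ...

    # what src/cli.py now does for each module's commands
    cli.add_command(load)

    if __name__ == "__main__":
        cli()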
src/bias.py (19 changed lines)
@@ -6,10 +6,6 @@ from pathlib import Path
 import os
 import csv
 
-@click.group()
-def cli() -> None:
-    ...
-
 def map(rating:str) -> int:
     mapping = {
         'right' : 0,
@@ -22,7 +18,7 @@ def map(rating:str) -> int:
     return mapping[rating]
 
 
-@cli.command()
+@click.command(name="bias:load")
 def load() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -35,8 +31,8 @@ def load() -> None:
         ,b.*
         from read_csv_auto('{f}') b
     """)
-@cli.command()
-def join() -> None:
+@click.command(name="bias:normalize")
+def normalize() -> None:
     DB = connect()
 
     DB.sql("""
@@ -101,7 +97,7 @@ def join() -> None:
         where publisher ilike '%CNN%'
     """)
 
-@cli.command()
+@click.command(name='bias:debug')
 def debug() -> None:
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -145,8 +141,8 @@ def debug() -> None:
 
     outlets
 
-@cli.command()
-def parse_html() -> None:
+@click.command(name='bias:parse')
+def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
     DB = connect()
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -175,6 +171,3 @@ def parse_html() -> None:
             ratings.append(rating)
     df = pd.DataFrame(ratings)
     df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
-
-if __name__ == "__main__":
-    cli()
src/broken.py

@@ -1,21 +1,43 @@
 import requests
-import click
-from data import connect
 import seaborn as sns
 import matplotlib.pyplot as plt
+import click
 
-DB = connect()
+from data import connect
 
-DB.sql("""
-    select
-        id
-        ,url
-    from stories
-""")
+@click.command(name="broken:crawl")
+def crawl():
+    """crawl story urls checking for link rot or redirects."""
+    DB = connect()
 
-DB.sql("""
-    describe stories
-""")
+    urls = DB.query("""
+        select
+            id
+            ,url
+        from stories
+        order by published_at asc
+        limit 5
+    """).fetchall()
 
-sns.histplot(x=hist['cnt'])
-plt.show()
+    DB.close()
+
+    story_id, url = urls[1]
+    # url
+    responses = []
+    for story_id, url in urls:
+        out = {'story_id' : story_id, 'final_url' : url, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
+        try:
+            response = requests.get(url, verify=False, timeout=10)
+            if len(response.history) > 1:
+                out['redirect'] = 1
+            if url != response.url:
+                out['final_url'] = response.url
+            out['status_code'] = response.status_code
+            out['content_length'] = len(response.content)
+        except requests.exceptions.ReadTimeout as e:
+            print(f"timeout: {url}")
+            out['timeout'] = 1
+        responses.append(out)
+
+    sns.histplot(x=hist['cnt'])
+    plt.show()
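A hedged note on the new crawl loop: in requests, response.history holds one Response per redirect hop, so it is non-empty whenever any redirect occurred; the committed len(response.history) > 1 test misses single-hop redirects, and hist in the final histplot call is not defined in this file. A sketch of the per-URL check under those observations (names follow the diff):

    import requests

    def check_url(story_id: int, url: str, timeout: int = 10) -> dict:
        """Fetch one story URL and record redirect/timeout/status details."""
        out = {'story_id': story_id, 'final_url': url, 'timeout': 0,
               'status_code': 200, 'content_length': 0, 'redirect': 0}
        try:
            response = requests.get(url, timeout=timeout)
            if response.history:  # any redirect hop makes history non-empty
                out['redirect'] = 1
                out['final_url'] = response.url
            out['status_code'] = response.status_code
            out['content_length'] = len(response.content)
        except requests.exceptions.ReadTimeout:
            out['timeout'] = 1
        return out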
src/cli.py (17 changed lines)
@@ -62,4 +62,21 @@ def plot(name: PlotName, output: Path):
     plt.savefig(output)
 
 if __name__ == "__main__":
+    import scrape
+    cli.add_command(scrape.download)
+    cli.add_command(scrape.parse)
+    cli.add_command(scrape.load)
+    cli.add_command(scrape.normalize)
+    import word
+    # cli.add_command(word.distance)
+    # cli.add_command(word.train)
+    cli.add_command(word.embed)
+    cli.add_command(word.max_sequence)
+    import bias
+    cli.add_command(bias.parse)
+    cli.add_command(bias.load)
+    cli.add_command(bias.normalize)
+    # import mine
+    # cli.add_command(mine.embeddings)
+    # cli.add_command(mine.cluster)
     cli()
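With those registrations in place, every command is reachable through the one entry point, e.g. `python src/cli.py bias:load` or `python src/cli.py scrape:parse` (invocation path assumed; click accepts colons in command names). Running `python src/cli.py --help` lists all registered commands, and the `module:` prefixes keep them visually grouped.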
src/mine.py (new file)

@@ -0,0 +1,99 @@
+from data import data_dir, connect
+import numpy as np
+import sklearn
+from sklearn.cluster import MiniBatchKMeans
+
+
+@click.command(name="mine:embeddings")
+def embeddings():
+    data = np.load(data_dir() / "embeddings.npy")
+    kmeans = MiniBatchKMeans(n_clusters=5,
+                             random_state=0,
+                             batch_size=6,
+                             n_init="auto")
+    model = kmeans.fit(data)
+    clusters = model.predict(data)
+
+    db = connect()
+
+    stories = db.sql("""
+        select
+            id
+        from stories
+        order by id desc
+    """).df()
+    stories['cluster'] = clusters
+
+    db.execute("drop table clusters")
+    db.execute("CREATE TABLE clusters (story_id BIGINT, cluster integer)")
+    db.append("clusters", stories)
+
+@click.command(name="mine:cluster")
+def cluster():
+
+    import pandas as pd
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+    df = db.sql("""
+        select
+            s.publisher
+            ,c.cluster
+            ,count(1) as total
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        group by
+            s.publisher, c.cluster
+    """).df()
+
+    pivoted = df.pivot(index='publisher', columns='cluster', values='total').sort_values([0], ascending=False)
+    pivoted[:25]
+
+    db.sql("""
+        select
+            publisher
+            ,title
+        from clusters c
+        join stories s
+            on s.id = c.story_id
+        where c.cluster = 0
+    """)
+
+    len(stories)
+    data.shape
+
+def main():
+    db.sql("""
+        select
+            count(distinct publisher)
+        from stories
+    """)
+
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+        select
+            max(length(title)) as max
+            ,min(length(title)) as min
+            ,avg(length(title)) as avg
+            ,sum(length(title)) as characters
+        from cte
+    """).fetchall()
+    """
+    let's calculate the size of the word embeddings stored as a list in the database
+    db.sql("""
+        with cte as (
+            select
+                distinct title
+            from stories
+        )
+
+    db.sql("""
+        select
+            count(distinct url)
+        from stories
+    """)
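As committed, this module is still exploratory: it never imports click, cluster() uses db without connecting, there are REPL-style bare expressions (pivoted[:25], len(stories), data.shape), and main() mixes live SQL with what looks like an unterminated triple-quoted note, so it likely doesn't import cleanly; that would explain why its registration stays commented out in src/cli.py. For reference, a self-contained sketch of the mine:embeddings step (duckdb and sklearn APIs as documented; paths and table names follow the diff; CREATE OR REPLACE is a swapped-in guard so the drop cannot fail on a missing table):

    import numpy as np
    import duckdb
    from sklearn.cluster import MiniBatchKMeans

    data = np.load("embeddings.npy")  # shape (n_stories, n_features)
    clusters = MiniBatchKMeans(n_clusters=5, random_state=0,
                               batch_size=6, n_init="auto").fit_predict(data)

    db = duckdb.connect("stories.duckdb")
    stories = db.sql("select id from stories order by id desc").df()
    stories['cluster'] = clusters  # row order must match the embeddings

    db.execute("CREATE OR REPLACE TABLE clusters (story_id BIGINT, cluster INTEGER)")
    db.append("clusters", stories)  # DataFrame columns map positionally
    db.close()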
src/scrape.py

@@ -8,36 +8,33 @@ from data import data_dir, connect
 from lxml import etree
 import pandas as pd
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
-@click.option('--directory', type=Path, default=data_dir())
-@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
 def load(directory, database):
     stories = directory / "stories.csv"
     related = directory / "related.csv"
     db = connect()
 
     db.sql(f"""
-        CREATE TABLE stories AS
+        CREATE OR REPLACE TABLE stories AS
         SELECT
             *
         FROM read_csv_auto('{stories}')
     """)
 
     db.sql(f"""
-        CREATE TABLE related_stories AS
+        CREATE OR REPLACE TABLE related_stories AS
         SELECT
             *
         FROM read_csv_auto('{related}')
     """)
     db.close()
 
-@cli.command()
-@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
 def download(output_dir):
+    """download every day from 01/10/2005 to today from memeorandum.com"""
     day = timedelta(days=1)
     cur = date(2005, 10, 1)
     end = date.today()
@@ -58,7 +55,7 @@ def download(output_dir):
             f.write(r.text)
 
 
-@cli.command()
+@click.command(name='scrape:parse')
 @click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
 @click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
@@ -83,6 +80,7 @@ def parse(directory, output_dir):
     # item = items[0]
     for item in items:
         out = dict()
+        out['published_at'] = date
         citation = item.xpath('./cite')
         if not citation:
             continue
@@ -122,23 +120,55 @@ def parse(directory, output_dir):
             another['parent_id'] = item_id
             others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
 
-@cli.command()
+@click.command(name='scrape:normalize')
 def normalize():
+    """fix database after load. remove duplicates. create publishers."""
     DB = connect()
     DB.sql("""
-        create table publishers as
-        select
-            row_number() over(order by publisher) as id
-            ,publisher
-            ,publisher_url
-        from stories
-        group by publisher, publisher_url
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+            OR title_ctn > 1
+        )
+    """)
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher
+                ,s.publisher_url
+            FROM stories s
+            GROUP BY
+                s.publisher
+                ,s.publisher_url
+        ), together AS (
+            SELECT
+                COALESCE(cte.publisher, r.publisher) AS publisher
+                ,cte.publisher_url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+                ON cte.publisher = r.publisher
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.*
+        FROM together t
+        GROUP BY
+            publisher
+            ,publisher_url
     """)
 
 
-if __name__ == "__main__":
-    cli()
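One hedged observation on the new dedup query: the ROW_NUMBER() OVER(PARTITION BY url) and OVER(PARTITION BY title) windows carry no ORDER BY, so which duplicate row survives is unspecified. If the kept row matters, adding an ordering inside the window, e.g. OVER(PARTITION BY url ORDER BY id) (standard window-function syntax, assuming id is stable), would pin it deterministically.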
src/word.py (20 changed lines)
@@ -6,12 +6,9 @@ from tqdm import tqdm
 import torch
 from pathlib import Path
 
-@click.group()
-def cli():
-    ...
-
-@cli.command()
+@click.command(name="word:max-sequence")
 def max_sequence():
+    """calculate the maximum token length given the story titles"""
     db = connect()
     longest = db.sql("""
         select
@@ -20,16 +17,19 @@ def max_sequence():
         order by length(title) desc
         limit 5000
     """).df()
+    db.close()
 
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     tokens = tokenizer(longest['title'].to_list())
     print(f"{max([len(x) for x in tokens['input_ids']])}")
 
-@cli.command()
+@click.command(name="word:train")
 def train():
+    """TODO"""
     table = from_db(Data.Titles)
     n_classes = 10
 
+@click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
 @click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
 @click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
@@ -50,6 +50,7 @@ def embed(chunks, embedding_dest, token_dest):
         from stories
         order by id desc
     """).df()
+    db.close()
 
     # normalize text
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -61,7 +62,6 @@ def embed(chunks, embedding_dest, token_dest):
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-            #outputs = outputs.to(torch.device('cpu'))
         return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
     tokens = []
@@ -80,7 +80,7 @@ def embed(chunks, embedding_dest, token_dest):
     np.save(embedding_dest, embeddings)
     np.save(token_dest, tokens)
 
-@cli.command()
+@click.command(name="word:distance")
 def distance():
     """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
@@ -88,7 +88,3 @@ def distance():
     min_index = (np.argmin(distances))
     closest = np.unravel_index(min_index, distances.shape)
     distances.flatten().shape
-
-
-if __name__ == "__main__":
-    cli()
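For context, the token-length check that word:max-sequence performs reduces to the following sketch (transformers API as documented; "roberta-base" comes from the diff, the titles are placeholders). Without padding, each input_ids entry keeps its own length, so the max over them is the longest tokenized title:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    titles = ["a short headline", "a much longer headline about something"]
    tokens = tokenizer(titles)  # no padding: per-title token lists
    print(max(len(ids) for ids in tokens["input_ids"]))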