add progress and better scraping.
This commit is contained in:
180
src/bias.py
Normal file
180
src/bias.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import click
|
||||
from data import connect
|
||||
import pandas as pd
|
||||
from lxml import etree
|
||||
from pathlib import Path
|
||||
import os
|
||||
import csv
|
||||
|
||||
@click.group()
def cli() -> None:
    """Command-line entry point grouping the bias data commands."""
|
||||
|
||||
def map(rating: str) -> int:
    """Encode an allsides bias-rating slug as a stable integer label.

    Raises:
        KeyError: if *rating* is not one of the known slugs.
    """
    # NOTE(review): this shadows the builtin ``map``; the name is kept for
    # compatibility with existing callers.
    codes = {
        'right': 0,
        'left-center': 1,
        'center': 2,
        'left': 3,
        'allsides': 4,
        'right-center': 5,
    }
    return codes[rating]
|
||||
|
||||
|
||||
@cli.command()
def load() -> None:
    """Load bias_ratings.csv into the warehouse as table ``bias_ratings``.

    Reads the csv from $DATA_MINING_DATA_DIR and adds a surrogate ``id``
    column numbered in publisher order.
    """
    DB = connect()
    # Raises KeyError if DATA_MINING_DATA_DIR is unset.
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    # f is interpolated directly into the SQL text; acceptable here because
    # the path comes from a local environment variable, not untrusted input.
    DB.sql(f"""
        create table bias_ratings as
        select
            row_number() over(order by b.publisher) as id
            ,b.*
        from read_csv_auto('{f}') b
    """)
|
||||
@cli.command()
def join() -> None:
    """Run ad-hoc join/coverage queries between stories and bias_ratings.

    NOTE(review): the relations returned by DB.sql are neither printed nor
    captured — presumably this was exercised interactively; confirm intent.
    """
    DB = connect()

    # Top 15 publishers by story count that have a bias rating.
    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            s.publisher
            ,s.stories
            ,b.publisher
            ,b.bias
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
        order by
            stories desc
        limit 15
    """)

    # Total matched stories and mean agree/disagree ratio.
    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(stories)
            ,avg(agree / disagree)
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
    """)

    # Coverage: stories with vs. without a bias rating.
    # NOTE(review): "precent_matched" is a typo, and the expression divides
    # matched by unmatched rather than by the overall total — it is a ratio,
    # not a percentage; confirm before relying on it.
    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(s.stories) filter(where b.publisher is not null) as matched
            ,sum(s.stories) filter(where b.publisher is null) as unmatched
            ,cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
            / sum(s.stories) filter(where b.publisher is null) as precent_matched
        from bias_ratings b
        right join cte s
            on s.publisher = b.publisher
    """)

    # Spot-check how CNN appears in the ratings table.
    DB.sql("""
        select
            *
        from bias_ratings
        where publisher ilike '%CNN%'
    """)
|
||||
|
||||
@cli.command()
def debug() -> None:
    """Exploratory queries reconciling story outlets against bias ratings.

    NOTE(review): references a ``bias`` table with ``outlet``/``lean``
    columns and a ``stories.outlet`` column — confirm these exist in the
    current schema (other commands here use ``publisher``/``bias``).
    """
    DB = connect()
    # Kept for parity with the other commands; unused by the queries below.
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    # Fuzzy-join outlets to bias ratings and compute each outlet's share of
    # all stories.  Fixed: ``cross join total.total`` is not valid SQL — the
    # cross-join target is the CTE ``total`` itself, not its column.
    DB.sql("""
        with cte as (
            select
                outlet
                ,count(1) as stories
            from stories
            group by outlet
        )
        ,total as (
            select
                sum(stories) as total
            from cte
        )
        select
            cte.outlet
            ,cte.stories
            ,bias.outlet
            ,bias.lean
            ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
            ,total.total
        from cte
        join bias
            on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
        cross join total
    """)

    # The 50 most prolific outlets by story count.
    DB.sql("""
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
        order by count(1) desc
        limit 50
    """)

# Fixed: removed a stray module-level ``outlets`` expression that followed
# this function — it was an undefined name and raised NameError on import.
|
||||
|
||||
@cli.command()
def parse_html() -> None:
    """Parse the saved allsides.com bias-ratings html page into a csv file.

    Reads $DATA_MINING_DATA_DIR/allsides.html and writes bias_ratings.csv
    (pipe-delimited) with columns publisher, bias, agree, disagree.
    """
    # NOTE(review): DB is opened but never used in this command.
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    bias_html = DATA_DIR / 'allsides.html'

    parser = etree.HTMLParser()
    tree = etree.parse(str(bias_html), parser)
    root = tree.getroot()
    # One <tr> per rated publisher in the allsides ratings table.
    rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')

    ratings = []
    for row in rows:
        rating = dict()
        publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
        rating['publisher'] = publisher

        # The bias slug (e.g. "left-center") is the last path segment of the
        # rating-image link; map() elsewhere converts it to an integer code.
        bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
        bias = bias.split('/')[-1]
        rating['bias'] = bias

        # Community agree/disagree vote counts.
        agree = row.xpath('.//span[contains(@class, "agree")]')[0].text
        disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text

        # NOTE(review): int() raises ValueError if the counts carry thousands
        # separators (e.g. "1,234") — confirm against the saved page.
        rating['agree'] = int(agree)
        rating['disagree'] = int(disagree)
        ratings.append(rating)
    df = pd.DataFrame(ratings)
    # Pipe-delimited with non-numeric quoting so publisher names containing
    # the delimiter survive a round trip.
    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
|
||||
|
||||
if __name__ == "__main__":
    # Dispatch to the click command group when run as a script.
    cli()
|
||||
@@ -1,46 +0,0 @@
|
||||
import click
|
||||
import duckdb
|
||||
from data import connect
|
||||
import polars as ps
|
||||
|
||||
DB = connect()
|
||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")
|
||||
|
||||
DB.sql("""
|
||||
with cte as (
|
||||
select
|
||||
outlet
|
||||
,count(1) as stories
|
||||
from stories
|
||||
group by outlet
|
||||
)
|
||||
,total as (
|
||||
select
|
||||
sum(stories) as total
|
||||
from cte
|
||||
)
|
||||
select
|
||||
cte.outlet
|
||||
,cte.stories
|
||||
,bias.outlet
|
||||
,bias.lean
|
||||
,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
|
||||
,total.total
|
||||
from cte
|
||||
join bias
|
||||
on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
|
||||
cross join total.total
|
||||
""")
|
||||
|
||||
DB.sql("""
|
||||
select
|
||||
outlet
|
||||
,count(1) as stories
|
||||
from stories
|
||||
group by outlet
|
||||
order by count(1) desc
|
||||
limit 50
|
||||
""")
|
||||
|
||||
outlets
|
||||
@@ -59,27 +59,28 @@ def download(output_dir):
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
|
||||
@click.option('-o', '--output_dir', type=Path, default=data_dir())
|
||||
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
|
||||
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
|
||||
def parse(directory, output_dir):
|
||||
"""parse the html files on disk into a structured csv format."""
|
||||
directory = data_dir() / "memeorandum"
|
||||
parser = etree.HTMLParser()
|
||||
pages = [f for f in directory.glob("*.html")]
|
||||
published = []
|
||||
others = []
|
||||
#page = pages[0]
|
||||
# page = pages[0]
|
||||
page_iter = tqdm(pages, postfix="starting")
|
||||
for page in page_iter:
|
||||
page_iter.set_postfix_str(f"{page}")
|
||||
date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
|
||||
# tree = etree.parse(str(page), parser)
|
||||
tree = etree.parse(str(page), parser)
|
||||
root = tree.getroot()
|
||||
if not root:
|
||||
if root is None:
|
||||
print(f"error opening {page}")
|
||||
continue
|
||||
items = root.xpath("//div[contains(@class, 'item')]")
|
||||
|
||||
# item = items[0]
|
||||
for item in items:
|
||||
out = dict()
|
||||
citation = item.xpath('./cite')
|
||||
@@ -92,16 +93,24 @@ def parse(directory, output_dir):
|
||||
author = ''
|
||||
out['author'] = author
|
||||
try:
|
||||
url = citation[0].getchildren()[0].get('href')
|
||||
publisher_url = citation[0].getchildren()[0].get('href')
|
||||
publisher = citation[0].getchildren()[0].text
|
||||
except IndexError as e:
|
||||
print(f"error with citation url: {page}")
|
||||
out['publisher'] = publisher
|
||||
out['publisher_url'] = url
|
||||
out['publisher_url'] = publisher_url
|
||||
|
||||
title = item.xpath('.//strong/a')[0].text
|
||||
out['title'] = title
|
||||
item_id = hash((title,page.stem,url))
|
||||
|
||||
url = item.xpath('.//strong/a')[0].get('href')
|
||||
out['url'] = url
|
||||
|
||||
item_id = hash((page.stem, url))
|
||||
out['id'] = item_id
|
||||
|
||||
old_id = hash((title, page.stem, publisher_url))
|
||||
out['old_id'] = old_id
|
||||
published.append(out)
|
||||
|
||||
related = item.xpath(".//span[contains(@class, 'mls')]/a")
|
||||
@@ -113,9 +122,22 @@ def parse(directory, output_dir):
|
||||
another['parent_id'] = item_id
|
||||
others.append(another)
|
||||
df = pd.DataFrame(published)
|
||||
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
|
||||
df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
|
||||
df = pd.DataFrame(others)
|
||||
df.to_csv(output_dir / 'related.csv', sep='|', index=False)
|
||||
df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
|
||||
|
||||
@cli.command()
def normalize():
    """Create a ``publishers`` dimension table from distinct story publishers.

    Each distinct (publisher, publisher_url) pair receives a surrogate
    ``id`` assigned in publisher-name order.
    """
    DB = connect()
    DB.sql("""
        create table publishers as
        select
            row_number() over(order by publisher) as id
            ,publisher
            ,publisher_url
        from stories
        group by publisher, publisher_url
    """)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
25
src/word.py
25
src/word.py
@@ -3,28 +3,49 @@ from scipy.spatial import distance
|
||||
from transformers import AutoTokenizer, RobertaModel
|
||||
import numpy as np
|
||||
from model import Model
|
||||
from data import Data, from_db
|
||||
from data import Data, from_db, connect
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
...
|
||||
|
||||
@cli.command()
def max_sequence():
    """Print the max roberta token count over the 5000 longest story titles.

    Used to choose a safe ``max_length`` for tokenizer padding/truncation.
    """
    db = connect()
    # Longest titles by character count — a proxy for longest token sequences.
    longest = db.sql("""
        select
            title
        from stories
        order by length(title) desc
        limit 5000
    """).df()

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    tokens = tokenizer(longest['title'].to_list())
    print(f"{max([len(x) for x in tokens['input_ids']])}")
|
||||
|
||||
@cli.command()
|
||||
def train():
|
||||
table = from_db(Data.Titles)
|
||||
|
||||
|
||||
n_classes = 10
|
||||
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
|
||||
model = RobertaModel.from_pretrained("roberta-base")
|
||||
|
||||
def get_embeddings(titles):
|
||||
# create tokens, padding to max width
|
||||
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
|
||||
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
|
||||
outputs = model(**tokens)
|
||||
return outputs.last_hidden_state[:, 0, :]
|
||||
|
||||
titles = table['title'].apply(str).to_list()[:10]
|
||||
get_embeddings(titles)
|
||||
|
||||
outputs.last_hidden_state[0][200:]
|
||||
outputs.values().shape
|
||||
model
|
||||
|
||||
# linear = torch.nn.Linear(model.config.hidden_size, n_classes)
|
||||
# act = torch.nn.Sigmoid()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user