add progress and better scraping.

This commit is contained in:
matt 2023-04-22 13:00:24 -07:00
parent 297aeec32d
commit d43ed4658a
7 changed files with 287 additions and 73 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*.csv
*.swp
__pycache__
tmp.py

View File

@@ -1,31 +1,39 @@
# Data Mining - CSCI 577
# Project Status Report I
# Project Status Report III
*2023-04-04*
*2023-04-18*
## Participants
Matt Jensen
## Overarching Purpose
Computer Science 477/577
Project Status Report III
Due: Tuesday, April 18
I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarity in Congress, income inequality, or the consolidation of media into the hands of a few.
## Tools
## Data Source
> The third project progress report should include a preliminary account of the existing software tools you will be using.
> Ideally, you obtain the software you will (probably) need and run it on sample files (or your real files), so make sure that you understand how they work.
> Do not wait to verify that there are no hidden complications.
> There are many plausible sources for such software, including the following:
To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, url, and related discussions for each story and store them in a .csv.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a notion of link similarity that could be explored in this analysis too.
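A rough sketch of the crawl for a single archive day is below; the archive URL pattern is an assumption I still need to verify against the site, and the selectors mirror the ones in my parser:

```python
# sketch: fetch and parse one day of the memeorandum archive
# (the archive URL format is an assumption; selectors mirror the repo's parse code)
import csv
import datetime

import requests
from lxml import etree


def scrape_day(day: datetime.date) -> list[dict]:
    url = f"https://www.memeorandum.com/{day:%y%m%d}/h2000"  # assumed URL pattern
    html = requests.get(url, timeout=30).text
    root = etree.fromstring(html, etree.HTMLParser())
    stories = []
    for item in root.xpath("//div[contains(@class, 'item')]"):
        cite = item.xpath('./cite')
        title_el = item.xpath('.//strong/a')
        if not cite or not title_el:
            continue
        stories.append({
            'published': day.isoformat(),
            'title': title_el[0].text,
            'url': title_el[0].get('href'),
            'publisher': cite[0].getchildren()[0].text if cite[0].getchildren() else None,
            'related': len(item.xpath(".//span[contains(@class, 'mls')]/a")),
        })
    return stories


if __name__ == '__main__':
    rows = scrape_day(datetime.date(2006, 1, 1))
    if rows:
        with open('stories_sample.csv', 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(rows[0]), delimiter='|')
            writer.writeheader()
            writer.writerows(rows)
```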
I will use the following suite of python tools to conduct my research:
## Techniques
- python
- pytorch
- scikit-learn
- duckdb
- requests
- pandas
- matplotlib
- seaborn
I am unsure which specific technique will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test how many clusters should exist in order to minimize the error.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
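A minimal sketch of how I might search for that cluster count with scikit-learn is below; the embedding matrix is a stand-in for the title embeddings described later, and silhouette score is just one possible proxy for the error measure:

```python
# sketch: sweep cluster counts and score each k
# (embeddings is a placeholder for the title-embedding matrix)
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def best_k(embeddings: np.ndarray, k_range=range(2, 21)) -> int:
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings)
        scores[k] = silhouette_score(embeddings, labels)
    return max(scores, key=scores.get)


# usage with stand-in data:
# print(best_k(np.random.rand(500, 768)))
```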
## Purpose
> This progress should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
\newpage
@@ -103,3 +111,31 @@ Another goal is to look at the political alignment over time.
I will train a classifier to predict political bias based on the word embeddings as well.
There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window), and I would be curious to know whether the titles of news articles could be a proxy for the location of the Overton window over time.
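A rough baseline sketch of that classifier is below, assuming the title embeddings and the allsides bias labels have already been joined on publisher; the array names are placeholders, and I would start with a plain logistic regression before trying a learned linear head on top of RoBERTa:

```python
# sketch: predict bias labels from title embeddings
# (X = per-title embedding vectors, y = allsides bias label joined on publisher)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def train_bias_classifier(X: np.ndarray, y: np.ndarray) -> LogisticRegression:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))
    return clf
```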
\newpage
# Project Status Report I
*2023-04-04*
## Participants
Matt Jensen
## Overarching Purpose
I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarity in Congress, income inequality, or the consolidation of media into the hands of a few.
## Data Source
To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, url, and related discussions for each story and store them in a .csv.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a notion of link similarity that could be explored in this analysis too.
## Techniques
I am unsure which specific technique will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test how many clusters should exist in order to minimize the error.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.

BIN
docs/progress_spec_3.docx Normal file

Binary file not shown.

180
src/bias.py Normal file
View File

@@ -0,0 +1,180 @@
import click
from data import connect
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
@click.group()
def cli() -> None:
...
def map(rating:str) -> int:
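# map an allsides bias label to an integer code (the codes are nominal, not ordered)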
mapping = {
'right' : 0,
'left-center' : 1,
'center' : 2,
'left' : 3,
'allsides' : 4,
'right-center' : 5
}
return mapping[rating]
@cli.command()
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
DB.sql(f"""
create table bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
@cli.command()
def join() -> None:
DB = connect()
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
s.publisher
,s.stories
,b.publisher
,b.bias
from bias_ratings b
join cte s
on s.publisher = b.publisher
order by
stories desc
limit 15
""")
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
sum(stories)
,avg(agree / disagree)
from bias_ratings b
join cte s
on s.publisher = b.publisher
""")
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
sum(s.stories) filter(where b.publisher is not null) as matched
,sum(s.stories) filter(where b.publisher is null) as unmatched
,cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
/ sum(s.stories) filter(where b.publisher is null) as percent_matched
from bias_ratings b
right join cte s
on s.publisher = b.publisher
""")
DB.sql("""
select
*
from bias_ratings
where publisher ilike '%CNN%'
""")
@cli.command()
def debug() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
DB.sql("""
with cte as (
select
outlet
,count(1) as stories
from stories
group by outlet
)
,total as (
select
sum(stories) as total
from cte
)
select
cte.outlet
,cte.stories
,bias.outlet
,bias.lean
,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
,total.total
from cte
join bias
on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
cross join total
""")
DB.sql("""
select
outlet
,count(1) as stories
from stories
group by outlet
order by count(1) desc
limit 50
""")
@cli.command()
def parse_html() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser)
root = tree.getroot()
rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')
ratings = []
for row in rows:
rating = dict()
publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
rating['publisher'] = publisher
bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
bias = bias.split('/')[-1]
rating['bias'] = bias
agree = row.xpath('.//span[contains(@class, "agree")]')[0].text
disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text
rating['agree'] = int(agree)
rating['disagree'] = int(disagree)
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
if __name__ == "__main__":
cli()

View File

@@ -1,46 +0,0 @@
import click
import duckdb
from data import connect
import polars as ps
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")
DB.sql("""
with cte as (
select
outlet
,count(1) as stories
from stories
group by outlet
)
,total as (
select
sum(stories) as total
from cte
)
select
cte.outlet
,cte.stories
,bias.outlet
,bias.lean
,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
,total.total
from cte
join bias
on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
cross join total.total
""")
DB.sql("""
select
outlet
,count(1) as stories
from stories
group by outlet
order by count(1) desc
limit 50
""")
outlets

View File

@@ -59,9 +59,10 @@ def download(output_dir):
@cli.command()
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
@click.option('-o', '--output_dir', type=Path, default=data_dir())
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
"""parse the html files on disk into a structured csv format."""
directory = data_dir() / "memeorandum"
parser = etree.HTMLParser()
pages = [f for f in directory.glob("*.html")]
@@ -72,14 +73,14 @@ def parse(directory, output_dir):
for page in page_iter:
page_iter.set_postfix_str(f"{page}")
date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
# tree = etree.parse(str(page), parser)
tree = etree.parse(str(page), parser)
root = tree.getroot()
if not root:
if root is None:
print(f"error opening {page}")
continue
items = root.xpath("//div[contains(@class, 'item')]")
# item = items[0]
for item in items:
out = dict()
citation = item.xpath('./cite')
@@ -92,16 +93,24 @@ def parse(directory, output_dir):
author = ''
out['author'] = author
try:
url = citation[0].getchildren()[0].get('href')
publisher_url = citation[0].getchildren()[0].get('href')
publisher = citation[0].getchildren()[0].text
except IndexError as e:
print(f"error with citation url: {page}")
out['publisher'] = publisher
out['publisher_url'] = url
out['publisher_url'] = publisher_url
title = item.xpath('.//strong/a')[0].text
out['title'] = title
item_id = hash((title,page.stem,url))
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
item_id = hash((page.stem, url))
out['id'] = item_id
old_id = hash((title, page.stem, publisher_url))
out['old_id'] = old_id
published.append(out)
related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -113,9 +122,22 @@ def parse(directory, output_dir):
another['parent_id'] = item_id
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
df = pd.DataFrame(others)
df.to_csv(output_dir / 'related.csv', sep='|', index=False)
df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
@cli.command()
def normalize():
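# build a publishers lookup table, assigning a surrogate id to each (publisher, publisher_url) pair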
DB = connect()
DB.sql("""
create table publishers as
select
row_number() over(order by publisher) as id
,publisher
,publisher_url
from stories
group by publisher, publisher_url
""")
if __name__ == "__main__":

View File

@@ -3,28 +3,49 @@ from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
from data import Data, from_db
from data import Data, from_db, connect
@click.group()
def cli():
...
@cli.command()
def max_sequence():
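# tokenize the longest stored titles to find the max token length (informs max_length for padding)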
db = connect()
longest = db.sql("""
select
title
from stories
order by length(title) desc
limit 5000
""").df()
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokens = tokenizer(longest['title'].to_list())
print(f"{max([len(x) for x in tokens['input_ids']])}")
@cli.command()
def train():
table = from_db(Data.Titles)
n_classes = 10
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
def get_embeddings(titles):
# create tokens, padding to max width
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
outputs = model(**tokens)
return outputs.last_hidden_state[:, 0, :]
titles = table['title'].apply(str).to_list()[:10]
get_embeddings(titles)
# leftover scratch lines ('outputs' is local to get_embeddings and undefined here):
# outputs.last_hidden_state[0][200:]
# outputs.values().shape
# model
# linear = torch.nn.Linear(model.config.hidden_size, n_classes)
# act = torch.nn.Sigmoid()