add progress and better scraping.
This commit is contained in:
parent 297aeec32d
commit d43ed4658a

@@ -1,3 +1,4 @@
*.csv
*.swp
__pycache__
tmp.py

@@ -1,31 +1,39 @@
# Data Mining - CSCI 577

# Project Status Report I
# Project Status Report III

*2023-04-04*
*2023-04-18*

## Participants

Matt Jensen

## Overarching Purpose
Computer Science 477/577
Project Status Report III
Due: Tuesday, April 18

I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged on only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarity in Congress, income inequality, or the consolidation of media into the hands of a few.
## Tools

## Data Source
> The third project progress report should include a preliminary account of the existing software tools you will be using.
> Ideally, you obtain the software you will (probably) need and run it on sample files (or your real files), so make sure that you understand how they work.
> Do not wait to verify that there are no hidden complications.
> There are many plausible sources for such software, including the following:

To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, url, and related discussions and store them in a .csv.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a concept of link similarity that could be explored in this analysis too.
I will use the following suite of python tools to conduct my research (a minimal scraping sketch follows the list):

## Techniques
- python
- pytorch
- scikit-learn
- duckdb
- requests
- pandas
- matplotlib
- seaborn

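To make the crawl concrete, here is a minimal sketch of fetching and parsing a single archive day with `requests` and `lxml`. The archive URL pattern below is an assumption (the scraper in `src/scrape.py` actually parses pages already saved to disk), while the XPath selectors mirror the ones used there.

```python
# minimal sketch: fetch one memeorandum archive day and extract story rows
import csv
import datetime

import requests
from lxml import etree


def scrape_day(day: datetime.date) -> list[dict]:
    # assumed URL pattern for a day's archive page (unverified)
    url = day.strftime("https://www.memeorandum.com/%y%m%d/h2000")
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    root = etree.fromstring(resp.text, etree.HTMLParser())
    stories = []
    for item in root.xpath("//div[contains(@class, 'item')]"):
        title_el = item.xpath(".//strong/a")
        cite_el = item.xpath("./cite/a")
        if not title_el:
            continue
        stories.append({
            "date": day.isoformat(),
            "title": title_el[0].text,
            "url": title_el[0].get("href"),
            "publisher": cite_el[0].text if cite_el else None,
        })
    return stories


if __name__ == "__main__":
    rows = scrape_day(datetime.date(2006, 1, 1))
    with open("stories_sample.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["date", "title", "url", "publisher"], delimiter="|")
        writer.writeheader()
        writer.writerows(rows)
```
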
I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test what the ideal number of clusters should be in order to minimize the error, as sketched below.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
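
As a sketch of how that ideal cluster count could be estimated: sweep k and score each clustering, for example with scikit-learn's KMeans and the silhouette coefficient. The embeddings array below is a random placeholder, and silhouette is just one reasonable criterion rather than a committed choice.

```python
# pick k by sweeping cluster counts and scoring each clustering
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(500, 64))  # placeholder for real title embeddings

scores = {}
for k in range(2, 11):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings)
    scores[k] = silhouette_score(embeddings, labels)

best_k = max(scores, key=scores.get)
print(f"best k by silhouette score: {best_k}")
```
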
## Purpose

> This progress report should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.

\newpage

@@ -103,3 +111,31 @@ Another goal is to look at the political alignment over time.
I will train a classifier to predict political bias based on the word embeddings as well (a sketch follows).
There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window) and I would be curious to know whether the titles of news articles could be a proxy for the location of the Overton window over time.

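A rough sketch of that classifier, assuming title embeddings (for example the RoBERTa [CLS] vectors produced in `src/word.py`) as features and AllSides bias labels joined in by publisher as targets; logistic regression is only a stand-in model and the arrays below are placeholders.

```python
# train a simple bias classifier on title embeddings (placeholder data)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 768))   # placeholder title embeddings
y = rng.integers(0, 5, size=1000)  # placeholder bias classes (e.g. left..right)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```
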
\newpage

# Project Status Report I

*2023-04-04*

## Participants

Matt Jensen

## Overarching Purpose

I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged on only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarity in Congress, income inequality, or the consolidation of media into the hands of a few.

## Data Source

To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, url, and related discussions and store them in a .csv.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a concept of link similarity that could be explored in this analysis too.

## Techniques

I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test what the ideal number of clusters should be in order to minimize the error.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.

Binary file not shown.

@@ -0,0 +1,180 @@
import click
from data import connect
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv


@click.group()
def cli() -> None:
    ...


def map(rating: str) -> int:
    """map an allsides bias label to an integer code."""
    mapping = {
        'right'        : 0,
        'left-center'  : 1,
        'center'       : 2,
        'left'         : 3,
        'allsides'     : 4,
        'right-center' : 5
    }
    return mapping[rating]


@cli.command()
def load() -> None:
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    DB.sql(f"""
        create table bias_ratings as
        select
            row_number() over(order by b.publisher) as id
            ,b.*
        from read_csv_auto('{f}') b
    """)


@cli.command()
def join() -> None:
    DB = connect()

    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            s.publisher
            ,s.stories
            ,b.publisher
            ,b.bias
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
        order by
            stories desc
        limit 15
    """)

    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(stories)
            ,avg(agree / disagree)
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
    """)

    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(s.stories) filter(where b.publisher is not null) as matched
            ,sum(s.stories) filter(where b.publisher is null) as unmatched
            ,cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
                / sum(s.stories) filter(where b.publisher is null) as percent_matched
        from bias_ratings b
        right join cte s
            on s.publisher = b.publisher
    """)

    DB.sql("""
        select
            *
        from bias_ratings
        where publisher ilike '%CNN%'
    """)


@cli.command()
def debug() -> None:
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    DB.sql("""
        with cte as (
            select
                outlet
                ,count(1) as stories
            from stories
            group by outlet
        )
        ,total as (
            select
                sum(stories) as total
            from cte
        )
        select
            cte.outlet
            ,cte.stories
            ,bias.outlet
            ,bias.lean
            ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
            ,total.total
        from cte
        join bias
            on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
        cross join total
    """)

    DB.sql("""
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
        order by count(1) desc
        limit 50
    """)

    # outlets  (scratch reference from an interactive session; undefined here)


@cli.command()
def parse_html() -> None:
    """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    bias_html = DATA_DIR / 'allsides.html'

    parser = etree.HTMLParser()
    tree = etree.parse(str(bias_html), parser)
    root = tree.getroot()
    rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')

    ratings = []
    for row in rows:
        rating = dict()
        publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
        rating['publisher'] = publisher

        bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
        bias = bias.split('/')[-1]
        rating['bias'] = bias

        agree = row.xpath('.//span[contains(@class, "agree")]')[0].text
        disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text

        rating['agree'] = int(agree)
        rating['disagree'] = int(disagree)
        ratings.append(rating)
    df = pd.DataFrame(ratings)
    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)


if __name__ == "__main__":
    cli()

@@ -1,46 +0,0 @@
import click
import duckdb
from data import connect
import polars as ps

DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")

DB.sql("""
    with cte as (
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
    )
    ,total as (
        select
            sum(stories) as total
        from cte
    )
    select
        cte.outlet
        ,cte.stories
        ,bias.outlet
        ,bias.lean
        ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
        ,total.total
    from cte
    join bias
        on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
    cross join total.total
""")

DB.sql("""
    select
        outlet
        ,count(1) as stories
    from stories
    group by outlet
    order by count(1) desc
    limit 50
""")

outlets

@@ -59,27 +59,28 @@ def download(output_dir):


@cli.command()
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
@click.option('-o', '--output_dir', type=Path, default=data_dir())
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
    """parse the html files on disk into a structured csv format."""
    directory = data_dir() / "memeorandum"
    parser = etree.HTMLParser()
    pages = [f for f in directory.glob("*.html")]
    published = []
    others = []
    #page = pages[0]
    # page = pages[0]
    page_iter = tqdm(pages, postfix="starting")
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
        # tree = etree.parse(str(page), parser)
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        if not root:
        if root is None:
            print(f"error opening {page}")
            continue
        items = root.xpath("//div[contains(@class, 'item')]")

        # item = items[0]
        for item in items:
            out = dict()
            citation = item.xpath('./cite')
@@ -92,16 +93,24 @@ def parse(directory, output_dir):
            author = ''
            out['author'] = author
            try:
                url = citation[0].getchildren()[0].get('href')
                publisher_url = citation[0].getchildren()[0].get('href')
                publisher = citation[0].getchildren()[0].text
            except IndexError as e:
                print(f"error with citation url: {page}")
            out['publisher'] = publisher
            out['publisher_url'] = url
            out['publisher_url'] = publisher_url

            title = item.xpath('.//strong/a')[0].text
            out['title'] = title
            item_id = hash((title,page.stem,url))

            url = item.xpath('.//strong/a')[0].get('href')
            out['url'] = url

            item_id = hash((page.stem, url))
            out['id'] = item_id

            old_id = hash((title, page.stem, publisher_url))
            out['old_id'] = old_id
            published.append(out)

            related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -113,9 +122,22 @@ def parse(directory, output_dir):
                another['parent_id'] = item_id
                others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)

@cli.command()
def normalize():
    DB = connect()
    DB.sql("""
        create table publishers as
        select
            row_number() over(order by publisher) as id
            ,publisher
            ,publisher_url
        from stories
        group by publisher, publisher_url
    """)


if __name__ == "__main__":

src/word.py
@@ -3,28 +3,49 @@ from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
from data import Data, from_db
from data import Data, from_db, connect

@click.group()
def cli():
    ...

@cli.command()
def max_sequence():
    db = connect()
    longest = db.sql("""
        select
            title
        from stories
        order by length(title) desc
        limit 5000
    """).df()

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    tokens = tokenizer(longest['title'].to_list())
    print(f"{max([len(x) for x in tokens['input_ids']])}")

@cli.command()
def train():
    table = from_db(Data.Titles)


    n_classes = 10
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")

    def get_embeddings(titles):
        # create tokens, padding to max width
        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
        outputs = model(**tokens)
        return outputs.last_hidden_state[:, 0, :]

    titles = table['title'].apply(str).to_list()[:10]
    get_embeddings(titles)

    # scratch notes from an interactive session; `outputs` is not defined in this scope
    # outputs.last_hidden_state[0][200:]
    # outputs.values().shape
    # model

    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
    # act = torch.nn.Sigmoid()
