diff --git a/.gitignore b/.gitignore
index c682433..cef562a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.csv
 *.swp
+__pycache__
diff --git a/dist/jensen_577_progress_report.pdf b/dist/jensen_577_progress_report.pdf
new file mode 100644
index 0000000..7160ded
Binary files /dev/null and b/dist/jensen_577_progress_report.pdf differ
diff --git a/docs/progress.md b/docs/progress.md
new file mode 100644
index 0000000..0ad60de
--- /dev/null
+++ b/docs/progress.md
@@ -0,0 +1,105 @@
+# Data Mining - CSCI 577
+
+# Project Status Report I
+
+*2023-04-04*
+
+## Participants
+
+Matt Jensen
+
+## Overarching Purpose
+
+I hope to use a dataset of news articles to track the polarization of news over time.
+I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
+I think there is a connection to be made to other statistics, like voting polarization in Congress, income inequality, or the consolidation of media into the hands of the few.
+
+## Data Source
+
+To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
+I will grab the title, author, publisher, published date, url and related discussions and store them in a .csv.
+The site also has a concept of references, where a main, popular story may be covered by other sources.
+So there is a notion of link similarity that could be explored in this analysis too.
+
+## Techniques
+
+I am unsure which specific technique will work best, but I believe an unsupervised clustering algorithm will serve me well.
+I think there is a way to estimate the ideal number of clusters, i.e. the number that minimizes the clustering error.
+This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
+
+\newpage
+
+# Project Status Report II
+
+*2023-04-11*
+
+## Participants
+
+Matt Jensen
+
+## Dataset Description
+
+The dataset I will be using for my analysis has the following attributes:
+
+- title
+    - a text description of the news item.
+    - discrete, nominal.
+    - ~800k distinct titles.
+- url
+    - a text description and unique identifier for the news item.
+    - discrete, nominal.
+    - ~700k distinct urls.
+- author
+    - a text name.
+    - discrete, nominal.
+    - ~42k distinct authors.
+- publisher
+    - a text name.
+    - discrete, nominal.
+    - ~13k distinct outlets.
+- related links
+    - an adjacency matrix with the number of common links between two publishers.
+    - continuous, ratio.
+    - counts are less than the total number of stories.
+- published date
+    - the date the article was published.
+    - continuous, interval.
+    - ~5.5k distinct dates.
+
+In addition, I will augment the data with the following attributes:
+
+- title word embedding
+    - a vectorized form of the title taken from the output of an LLM or BERT-style model, which embeds the semantic meaning of the sentence.
+    - continuous, nominal.
+    - ~800k vectors of 768 values each.
+- political bias of the publisher
+    - a measure of how voters feel the publisher's political leaning maps to the political parties (Democrat/Republican).
+    - continuous, ordinal.
+    - ~30% of the publishers are labelled in the [allsides.com](https://www.allsides.com/media-bias/ratings) ratings.
+- estimated viewership of the publisher
+    - an estimate of the size of the audience that consumes the publisher's media.
+    - continuous, ratio.
+    - I still need to parse [The Future of Media Project](https://projects.iq.harvard.edu/futureofmedia/index-us-mainstream-media-ownership) data to get a good idea of this number.
+- number of broken links
+    - I will navigate all the links and count the number of 200, 301 and 404 status codes returned (a rough sketch of this check appears at the end of this report).
+    - discrete, nominal.
+    - the size of this dataset is still unknown.
+
+## Purpose
+
+I want to analyze data from the news aggregation site [memeorandum.com](https://www.memeorandum.com/) and combine it with media bias measurements from [allsides.com](https://www.allsides.com/media-bias/ratings).
+My goal for the project is to cluster the data based on the word embeddings of the titles.
+I will tokenize each title and use a BERT-style model to generate embeddings from the tokens.
+
+The word embeddings output by language models encode the semantic meaning of sentences.
+Specifically, BERT-base models output embeddings in a 768-dimensional space.
+Clustering these vectors maps this semantic space down to a lower-dimensional cluster space.
+
+My understanding of clustering leads me to believe that this lower-dimensional space encodes a notion of similarity.
+In this way, I hope to find outlets that tend to publish similar stories and group them together.
+I would guess that this lower-dimensional space will reflect story quantity and political leanings.
+I would expect news outlets with a similar quantity of stories and similar political leanings to be grouped together.
+Another goal is to look at political alignment over time.
+I will train a classifier to predict political bias based on the word embeddings as well.
+There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window), and I would be curious to know whether the titles of news articles could serve as a proxy for the location of the Overton window over time.
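+
+As a rough sketch of this pipeline (the `sentence-transformers` model name, the subsample size, and the silhouette-based choice of cluster count are placeholder assumptions, not final design decisions):
+
+```python
+# Sketch: embed story titles with a BERT-style model, then sweep the cluster
+# count and keep the one that scores best, as a stand-in for the "ideal" number
+# of viewpoints. Assumes stories.csv produced by `python src/scrape.py parse`
+# (written under DATA_MINING_DATA_DIR by default); the path here is illustrative.
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+titles = (
+    pd.read_csv("stories.csv", sep="|")["title"]
+    .dropna()
+    .drop_duplicates()
+    .sample(10_000, random_state=0)  # subsample to keep the sweep fast
+    .tolist()
+)
+
+model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim; a BERT-base model gives 768
+embeddings = model.encode(titles, show_progress_bar=True)
+
+scores = {}
+for k in range(2, 21):
+    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings)
+    scores[k] = silhouette_score(embeddings, labels, sample_size=2000, random_state=0)
+
+best_k = max(scores, key=scores.get)
+print(best_k, scores[best_k])
+```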
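+
+The broken-link count mentioned in the dataset description could start from the related-links file that the scraper produces (a minimal sketch; the sample size, timeout, and use of HEAD requests are assumptions):
+
+```python
+# Sketch: count HTTP status codes (200 / 301 / 404 / other) for scraped links.
+# Reads related.csv as written by src/scrape.py; 'url' matches that output's column name.
+from collections import Counter
+
+import pandas as pd
+import requests
+
+links = pd.read_csv("related.csv", sep="|")["url"].dropna().unique()
+counts = Counter()
+for url in links[:100]:  # small sample; a full crawl would need retries and rate limiting
+    try:
+        # allow_redirects=False so 301s are counted rather than silently followed
+        r = requests.head(url, allow_redirects=False, timeout=10)
+        counts[r.status_code] += 1
+    except requests.RequestException:
+        counts["error"] += 1
+
+print(counts)
+```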
+
diff --git a/docs/progress_spec_1.docx b/docs/progress_spec_1.docx
new file mode 100644
index 0000000..6379e39
Binary files /dev/null and b/docs/progress_spec_1.docx differ
diff --git a/docs/progress_spec_2.docx b/docs/progress_spec_2.docx
new file mode 100644
index 0000000..8ab6bf2
Binary files /dev/null and b/docs/progress_spec_2.docx differ
diff --git a/src/broken_links.py b/src/broken_links.py
new file mode 100644
index 0000000..565b9d4
--- /dev/null
+++ b/src/broken_links.py
@@ -0,0 +1,23 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from data import connect
+
+DB = connect()
+
+# ratio of stories whose (url, outlet) pair appears exactly once
+# to stories whose pair appears more than once
+dups = DB.sql("""
+with cte as (
+    select
+        count(1) as cnt
+    from stories
+    group by url, outlet
+)
+select
+    cast(sum(cnt) filter (where cnt = 1) as float)
+    / sum(cnt) filter (where cnt > 1) as dups
+from cte
+""").df()
+print(dups)
+
+# distribution of how many times each (url, outlet) pair appears
+hist = DB.sql("""
+select
+    count(1) as cnt
+from stories
+group by url, outlet
+""").df()
+
+sns.histplot(x=hist['cnt'])
+plt.show()
diff --git a/src/data.py b/src/data.py
index c55324c..e59878c 100644
--- a/src/data.py
+++ b/src/data.py
@@ -6,6 +6,15 @@ from enum import Enum
 class Data(str, Enum):
     Titles = 'titles'
 
+def data_dir():
+    return Path(os.environ['DATA_MINING_DATA_DIR'])
+
+def connect():
+    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
+    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
+    return DB
+
 def from_db(t: Data):
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
     # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
diff --git a/src/join_bias.py b/src/join_bias.py
new file mode 100644
index 0000000..a268f51
--- /dev/null
+++ b/src/join_bias.py
@@ -0,0 +1,46 @@
+import os
+from pathlib import Path
+
+import polars as ps
+
+from data import connect
+
+DB = connect()
+DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")
+
+# fuzzy-join scraped outlets to their allsides bias label on outlet name
+outlets_with_bias = DB.sql("""
+    with cte as (
+        select
+            outlet
+            ,count(1) as stories
+        from stories
+        group by outlet
+    )
+    ,total as (
+        select
+            sum(stories) as total
+        from cte
+    )
+    select
+        cte.outlet
+        ,cte.stories
+        ,bias.outlet as bias_outlet
+        ,bias.lean
+        ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
+        ,total.total
+    from cte
+    join bias
+        on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
+    cross join total
+""").df()
+
+# top 50 outlets by story count
+outlets = DB.sql("""
+    select
+        outlet
+        ,count(1) as stories
+    from stories
+    group by outlet
+    order by count(1) desc
+    limit 50
+""").df()
+
+print(outlets_with_bias)
+print(outlets)
diff --git a/src/scrape.py b/src/scrape.py
new file mode 100644
index 0000000..bff666f
--- /dev/null
+++ b/src/scrape.py
@@ -0,0 +1,92 @@
+from datetime import date, timedelta
+import datetime
+import requests
+from pathlib import Path
+import click
+from tqdm import tqdm
+from data import data_dir
+from lxml import etree
+import pandas as pd
+
+@click.group()
+def cli():
+    ...
+
+
+@cli.command()
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
+def download(output_dir):
+    """Download one archive snapshot page per day from memeorandum.com."""
+    output_dir.mkdir(parents=True, exist_ok=True)
+    day = timedelta(days=1)
+    cur = date(2005, 10, 1)
+    end = date.today()
+    dates = []
+    while cur <= end:
+        dates.append(cur)
+        cur = cur + day
+    date_iter = tqdm(dates)
+    for i in date_iter:
+        date_iter.set_postfix_str(f"{i}")
+        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
+        if save_as.exists():
+            continue
+        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
+        r = requests.get(url)
+        with open(save_as, 'w', encoding='utf-8') as f:
+            f.write(r.text)
+
+
+@cli.command()
+@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
+@click.option('-o', '--output_dir', type=Path, default=data_dir())
+def parse(directory, output_dir):
+    """Parse the downloaded pages into stories.csv and related.csv."""
+    parser = etree.HTMLParser()
+    pages = sorted(directory.glob("*.html"))
+    published = []
+    others = []
+    page_iter = tqdm(pages)
+    for page in page_iter:
+        page_iter.set_postfix_str(f"{page}")
+        page_date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
+        tree = etree.parse(str(page), parser)
+        root = tree.getroot()
+        items = root.xpath("//div[contains(@class, 'item')]")
+
+        for item in items:
+            out = dict()
+            citation = item.xpath('./cite')
+            if not citation:
+                continue
+            author = citation[0]
+            if author.text:
+                # cite text looks like "Author Name / Publisher"; keep the author part
+                author = ''.join(author.text.split('/')[:-1]).strip()
+            else:
+                author = ''
+            out['author'] = author
+            cite_link = citation[0].getchildren()[0]
+            url = cite_link.get('href')
+            publisher = cite_link.text
+            out['publisher'] = publisher
+            out['publisher_url'] = url
+            titles = item.xpath('.//strong/a')
+            if not titles:
+                continue
+            title = titles[0].text
+            out['title'] = title
+            out['published_date'] = page_date.date()
+            # note: hash() is salted per process, so ids are only consistent within one run
+            item_id = hash((title, page.stem, url))
+            out['id'] = item_id
+            published.append(out)
+
+            related = item.xpath(".//span[contains(@class, 'mls')]/a")
+            for relation in related:
+                another = dict()
+                another['url'] = relation.get('href')
+                another['publisher'] = relation.text
+                another['parent_id'] = item_id
+                others.append(another)
+    stories = pd.DataFrame(published)
+    stories.to_csv(output_dir / 'stories.csv', sep='|', index=False)
+    related_df = pd.DataFrame(others)
+    related_df.to_csv(output_dir / 'related.csv', sep='|', index=False)
+
+
+if __name__ == "__main__":
+    cli()