add progress and better scraping.

parent 297aeec32d
commit d43ed4658a
@@ -1,3 +1,4 @@
 *.csv
 *.swp
 __pycache__
+tmp.py
@@ -1,31 +1,39 @@
 # Data Mining - CSCI 577
 
-# Project Status Report I
+# Project Status Report III
 
-*2023-04-04*
+*2023-04-18*
 
 ## Participants
 
 Matt Jensen
 
-## Overarching Purpose
+Computer Science 477/577
+
+Project Status Report III
+
+Due: Tuesday, April 18
 
-I hope to use a dataset of news articles to track the polarization of news over time.
-I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
-I think there is a connection to be made to other statistics, like voting polarity in congress, or income inequality, or consolidation of media into the hands of the few.
+## Tools
 
-## Data Source
+> The third project progress report should include a preliminary account of the existing software tools you will be using.
+> Ideally, you obtain the software you will (probably) need and run it on sample files (or your real files), so make sure that you understand how they work.
+> Do not wait to verify that there are no hidden complications.
+> There are many plausible sources for such software, including the following:
 
-To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
-I will grab the title, author, publisher, published date, url and related discussions and store it in a .csv.
-The site also has a concept of references, where a main, popular story may be covered by other sources.
-So there is a concept of link similarity that could be explored in this analysis too.
+I will use the following suite of python tools to conduct my research:
 
-## Techniques
+- python
+- pytorch
+- scikit-learn
+- duckdb
+- requests
+- pandas
+- matplotlib
+- seaborn
 
-I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well.
-I think there is a way to test the ideal number of clusters that should exist to minimize the error.
-This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
+## Purpose
+
+> This progress report should also provide a definitive description of your purpose and how you intend to conduct it.
+> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
 
 \newpage
@@ -103,3 +111,31 @@ Another goal is to look at the political alignment over time.
 I will train a classifier to predict political bias based on the word embeddings as well.
 There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window) and I would be curious to know if the titles of news articles could be a proxy for the location of the Overton window over time.
+
+\newpage
+
+# Project Status Report I
+
+*2023-04-04*
+
+## Participants
+
+Matt Jensen
+
+## Overarching Purpose
+
+I hope to use a dataset of news articles to track the polarization of news over time.
+I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
+I think there is a connection to be made to other statistics, like voting polarity in congress, or income inequality, or consolidation of media into the hands of the few.
+
+## Data Source
+
+To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
+I will grab the title, author, publisher, published date, url and related discussions and store it in a .csv.
+The site also has a concept of references, where a main, popular story may be covered by other sources.
+So there is a concept of link similarity that could be explored in this analysis too.
+
+## Techniques
+
+I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well.
+I think there is a way to test the ideal number of clusters that should exist to minimize the error.
+This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
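The cluster-count test sketched under Techniques can be made concrete with a small sweep over k, scoring each partition and keeping the best one. Below is a minimal sketch using scikit-learn's silhouette score over TF-IDF vectors of the story titles; the stories_v2.csv path, '|' separator, and title column mirror what the parse command in this commit writes, but the snippet itself is an illustration rather than part of the commit:

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # story titles as written by the parse command ('|' separated csv)
    titles = pd.read_csv('stories_v2.csv', sep='|')['title'].dropna().astype(str)
    X = TfidfVectorizer(max_features=5000, stop_words='english').fit_transform(titles)

    # sweep candidate cluster counts and score each partition
    scores = {}
    for k in range(2, 12):
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
        scores[k] = silhouette_score(X, labels)

    best_k = max(scores, key=scores.get)
    print(f"best k by silhouette: {best_k}")

The k that maximizes the silhouette score is one defensible stand-in for the 'number of viewpoints' the report wants to estimate.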
Binary file not shown.
@@ -0,0 +1,180 @@
import click
from data import connect
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv


@click.group()
def cli() -> None:
    ...


def map(rating: str) -> int:
    """map an allsides bias label to an integer id."""
    mapping = {
        'right'        : 0,
        'left-center'  : 1,
        'center'       : 2,
        'left'         : 3,
        'allsides'     : 4,
        'right-center' : 5,
    }
    return mapping[rating]


@cli.command()
def load() -> None:
    """load the scraped bias ratings csv into duckdb."""
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    DB.sql(f"""
        create table bias_ratings as
        select
            row_number() over(order by b.publisher) as id
            ,b.*
        from read_csv_auto('{f}') b
    """)


@cli.command()
def join() -> None:
    """join scraped stories against the bias ratings by publisher."""
    DB = connect()

    # publishers with the most stories and their bias rating.
    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            s.publisher
            ,s.stories
            ,b.publisher
            ,b.bias
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
        order by
            stories desc
        limit 15
    """)

    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(stories)
            ,avg(agree / disagree)
        from bias_ratings b
        join cte s
            on s.publisher = b.publisher
    """)

    # how many stories have a matching bias rating vs. not.
    DB.sql("""
        with cte as (
            select
                s.publisher
                ,count(1) as stories
            from stories s
            group by s.publisher
        )
        select
            sum(s.stories) filter(where b.publisher is not null) as matched
            ,sum(s.stories) filter(where b.publisher is null) as unmatched
            ,cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
                / sum(s.stories) filter(where b.publisher is null) as percent_matched
        from bias_ratings b
        right join cte s
            on s.publisher = b.publisher
    """)

    DB.sql("""
        select
            *
        from bias_ratings
        where publisher ilike '%CNN%'
    """)


@cli.command()
def debug() -> None:
    """fuzzy match outlets against bias ratings with jaro-winkler similarity."""
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    f = str(DATA_DIR / "bias_ratings.csv")

    DB.sql("""
        with cte as (
            select
                outlet
                ,count(1) as stories
            from stories
            group by outlet
        )
        ,total as (
            select
                sum(stories) as total
            from cte
        )
        select
            cte.outlet
            ,cte.stories
            ,bias.outlet
            ,bias.lean
            ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
            ,total.total
        from cte
        join bias
            on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
        cross join total
    """)

    DB.sql("""
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
        order by count(1) desc
        limit 50
    """)
    # outlets  (leftover scratch reference)


@cli.command()
def parse_html() -> None:
    """parse the saved html page of allsides.com bias ratings into a normalized csv file."""
    DB = connect()
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    bias_html = DATA_DIR / 'allsides.html'

    parser = etree.HTMLParser()
    tree = etree.parse(str(bias_html), parser)
    root = tree.getroot()
    rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')

    ratings = []
    for row in rows:
        rating = dict()
        publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
        rating['publisher'] = publisher

        bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
        bias = bias.split('/')[-1]
        rating['bias'] = bias

        agree = row.xpath('.//span[contains(@class, "agree")]')[0].text
        disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text

        rating['agree'] = int(agree)
        rating['disagree'] = int(disagree)
        ratings.append(rating)

    df = pd.DataFrame(ratings)
    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)


if __name__ == "__main__":
    cli()
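Assuming the new module above is saved as something like src/bias.py (its path is not shown in this view) and DATA_MINING_DATA_DIR points at the data directory, it would be driven with `python src/bias.py parse-html` to turn the saved allsides.html into bias_ratings.csv, `python src/bias.py load` to register that csv as a duckdb table, and `python src/bias.py join` to compare publishers against the scraped stories (click 7+ exposes parse_html as parse-html; older versions keep the underscore).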
@@ -1,46 +0,0 @@
import click
import duckdb
from data import connect
import polars as ps


DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")

DB.sql("""
    with cte as (
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
    )
    ,total as (
        select
            sum(stories) as total
        from cte
    )
    select
        cte.outlet
        ,cte.stories
        ,bias.outlet
        ,bias.lean
        ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
        ,total.total
    from cte
    join bias
        on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
    cross join total.total
""")

DB.sql("""
    select
        outlet
        ,count(1) as stories
    from stories
    group by outlet
    order by count(1) desc
    limit 50
""")

outlets
@@ -59,27 +59,28 @@ def download(output_dir):
 @cli.command()
-@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
-@click.option('-o', '--output_dir', type=Path, default=data_dir())
+@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
+@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
 def parse(directory, output_dir):
+    """parse the html files on disk into a structured csv format."""
     directory = data_dir() / "memeorandum"
     parser = etree.HTMLParser()
     pages = [f for f in directory.glob("*.html")]
     published = []
     others = []
-    #page = pages[0]
+    # page = pages[0]
     page_iter = tqdm(pages, postfix="starting")
     for page in page_iter:
         page_iter.set_postfix_str(f"{page}")
         date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
-        # tree = etree.parse(str(page), parser)
         tree = etree.parse(str(page), parser)
         root = tree.getroot()
-        if not root:
+        if root is None:
             print(f"error opening {page}")
             continue
         items = root.xpath("//div[contains(@class, 'item')]")
 
+        # item = items[0]
         for item in items:
             out = dict()
             citation = item.xpath('./cite')
@@ -92,16 +93,24 @@ def parse(directory, output_dir):
                 author = ''
             out['author'] = author
             try:
-                url = citation[0].getchildren()[0].get('href')
+                publisher_url = citation[0].getchildren()[0].get('href')
                 publisher = citation[0].getchildren()[0].text
             except IndexError as e:
                 print(f"error with citation url: {page}")
             out['publisher'] = publisher
-            out['publisher_url'] = url
+            out['publisher_url'] = publisher_url
 
             title = item.xpath('.//strong/a')[0].text
             out['title'] = title
-            item_id = hash((title,page.stem,url))
+
+            url = item.xpath('.//strong/a')[0].get('href')
+            out['url'] = url
+
+            item_id = hash((page.stem, url))
             out['id'] = item_id
+
+            # id under the previous hashing scheme
+            old_id = hash((title, page.stem, publisher_url))
+            out['old_id'] = old_id
             published.append(out)
 
             related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -113,9 +122,22 @@ def parse(directory, output_dir):
                 another['parent_id'] = item_id
                 others.append(another)
     df = pd.DataFrame(published)
-    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
     df = pd.DataFrame(others)
-    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
+    df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
 
+
+@cli.command()
+def normalize():
+    """split publishers out of the stories table into their own table."""
+    DB = connect()
+    DB.sql("""
+        create table publishers as
+        select
+            row_number() over(order by publisher) as id
+            ,publisher
+            ,publisher_url
+        from stories
+        group by publisher, publisher_url
+    """)
+
+
 if __name__ == "__main__":
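With show_default on the options and the tqdm postfix above, running the parse command (for example `python src/scrape.py parse`; the module path is an assumption, since the file name is not shown in this view) now lists the default directories in --help and reports which archive page is currently being parsed, writing the results to stories_v2.csv and related_v2.csv in the output directory.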
src/word.py
@@ -3,28 +3,49 @@ from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
 from model import Model
-from data import Data, from_db
+from data import Data, from_db, connect
 
 
 @click.group()
 def cli():
     ...
 
 
+@cli.command()
+def max_sequence():
+    """print the max token length over the longest story titles."""
+    db = connect()
+    longest = db.sql("""
+        select
+            title
+        from stories
+        order by length(title) desc
+        limit 5000
+    """).df()
+
+    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+    tokens = tokenizer(longest['title'].to_list())
+    print(f"{max([len(x) for x in tokens['input_ids']])}")
+
+
 @cli.command()
 def train():
     table = from_db(Data.Titles)
 
     n_classes = 10
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
 
     def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens=True, truncation=True, padding="max_length", return_attention_mask=True, return_tensors="pt")
+        tokens = tokenizer(titles, add_special_tokens=True, truncation=True, padding="max_length", max_length=70, return_attention_mask=True, return_tensors="pt")
         outputs = model(**tokens)
         return outputs.last_hidden_state[:, 0, :]
 
     titles = table['title'].apply(str).to_list()[:10]
     get_embeddings(titles)
 
+    # scratch notes from exploring the model output shapes:
+    # outputs.last_hidden_state[0][200:]
+    # outputs.values().shape
+    # model
 
     # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
     # act = torch.nn.Sigmoid()
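The report above also plans to train a classifier to predict political bias from these embeddings. A minimal sketch of that step, continuing inside train() after get_embeddings is defined, assuming a bias_labels array of integer ratings aligned with `titles` (for example from the map() helper in the bias module; the wiring between the two files is an assumption, not part of this commit):

        # sketch: fit a simple classifier on the CLS-token embeddings.
        # LogisticRegression stands in for the torch Linear + Sigmoid head
        # commented out above; bias_labels is an assumed input.
        import torch
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split

        with torch.no_grad():
            X = get_embeddings(titles).numpy()   # (n, 768) title embeddings
        X_train, X_test, y_train, y_test = train_test_split(X, bias_labels, test_size=0.2, random_state=0)
        clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
        print(f"held-out accuracy: {clf.score(X_test, y_test):.2f}")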