add progress and better scraping.

This commit is contained in:
matt 2023-04-22 13:00:24 -07:00
parent 297aeec32d
commit d43ed4658a
7 changed files with 287 additions and 73 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*.csv
*.swp
__pycache__
tmp.py

View File

@@ -1,31 +1,39 @@
# Data Mining - CSCI 577
# Project Status Report I
# Project Status Report III
*2023-04-04*
*2023-04-18*
## Participants
Matt Jensen
## Overarching Purpose
Computer Science 477/577
Project Status Report III
Due: Tuesday, April 18
I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarization in Congress, income inequality, or the consolidation of media into the hands of the few.
## Tools
## Data Source
> The third project progress report should include a preliminary account of the existing software tools you will be using.
> Ideally, you obtain the software you will (probably) need and run it on sample files (or your real files), so make sure that you understand how they work.
> Do not wait to verify that there are no hidden complications.
> There are many plausible sources for such software, including the following:
To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, URL, and related discussions, and store them in a .csv file.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a notion of link similarity that could be explored in this analysis too.
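To make the scraping step concrete, here is a minimal sketch of how a saved archive page could be parsed (the helper name is just for illustration; the `item` xpath mirrors my scraper, and the exact markup may need adjusting):

```python
# Minimal sketch: pull story titles and links out of a saved memeorandum archive page.
# Assumes pages are saved to disk as YY-MM-DD.html, as my scraper does.
from pathlib import Path
from lxml import etree


def parse_archive(page: Path) -> list[dict]:
    tree = etree.parse(str(page), etree.HTMLParser())
    stories = []
    for item in tree.getroot().xpath("//div[contains(@class, 'item')]"):
        link = item.xpath(".//strong/a")
        if not link:
            continue
        stories.append({
            "title": link[0].text,
            "url": link[0].get("href"),
            "date": page.stem,  # the file name doubles as the publication date
        })
    return stories
```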
I will use the following suite of python tools to conduct my research:
## Techniques
- python
- pytorch
- scikit-learn
- duckdb
- requests
- pandas
- matplotlib
- seaborn
I am unsure which specific technique will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test how many clusters should exist in order to minimize the error.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.
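A minimal sketch of that search, using scikit-learn with TF-IDF features over the titles as a stand-in for whatever representation I end up using:

```python
# Minimal sketch: choose a cluster count k by silhouette score.
# TF-IDF over headlines stands in for the eventual feature representation.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score


def best_k(titles: list[str], k_range=range(2, 12)) -> int:
    X = TfidfVectorizer(max_features=5000, stop_words="english").fit_transform(titles)
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
        scores[k] = silhouette_score(X, labels)
    return max(scores, key=scores.get)
```

The k that maximizes the silhouette score would be my rough proxy for the number of distinct viewpoints.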
## Purpose
> This progress should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
\newpage
@@ -103,3 +111,31 @@ Another goal is to look at the political alignment over time.
I will train a classifier to predict political bias based on the word embeddings as well.
There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window), and I would be curious to know whether the titles of news articles could serve as a proxy for the location of the Overton window over time.
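A minimal sketch of that classifier, assuming title embeddings are already computed (e.g., the RoBERTa sentence vectors used elsewhere in this project) and each title is labeled with its publisher's bias rating:

```python
# Minimal sketch: predict a political-bias label from precomputed title embeddings.
# X: (n_titles, hidden_size) embedding matrix; y: integer bias labels per title.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def train_bias_classifier(X: np.ndarray, y: np.ndarray) -> LogisticRegression:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y
    )
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print(f"held-out accuracy: {clf.score(X_test, y_test):.3f}")
    return clf
```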
\newpage
# Project Status Report I
*2023-04-04*
## Participants
Matt Jensen
## Overarching Purpose
I hope to use a dataset of news articles to track the polarization of news over time.
I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints.
I think there is a connection to be made to other statistics, like voting polarization in Congress, income inequality, or the consolidation of media into the hands of the few.
## Data Source
To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward.
I will grab the title, author, publisher, published date, URL, and related discussions, and store them in a .csv file.
The site also has a concept of references, where a main, popular story may be covered by other sources.
So there is a notion of link similarity that could be explored in this analysis too.
## Techniques
I am unsure which specific technique will work best, but I believe an unsupervised clustering algorithm will serve me well.
I think there is a way to test how many clusters should exist in order to minimize the error.
This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media.

BIN
docs/progress_spec_3.docx Normal file

Binary file not shown.

180
src/bias.py Normal file
View File

@@ -0,0 +1,180 @@
import click
from data import connect
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
@click.group()
def cli() -> None:
...
def map(rating:str) -> int:
mapping = {
'right' : 0,
'left-center' : 1,
'center' : 2,
'left' : 3,
'allsides' : 4,
'right-center' : 5
}
return mapping[rating]
@cli.command()
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
DB.sql(f"""
create table bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
@cli.command()
def join() -> None:
DB = connect()
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
s.publisher
,s.stories
,b.publisher
,b.bias
from bias_ratings b
join cte s
on s.publisher = b.publisher
order by
stories desc
limit 15
""")
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
sum(stories)
,avg(agree / disagree)
from bias_ratings b
join cte s
on s.publisher = b.publisher
""")
DB.sql("""
with cte as (
select
s.publisher
,count(1) as stories
from stories s
group by s.publisher
)
select
sum(s.stories) filter(where b.publisher is not null) as matched
,sum(s.stories) filter(where b.publisher is null) as unmatched
,100 * cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
/ sum(s.stories) as percent_matched
from bias_ratings b
right join cte s
on s.publisher = b.publisher
""")
DB.sql("""
select
*
from bias_ratings
where publisher ilike '%CNN%'
""")
@cli.command()
def debug() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
DB.sql("""
with cte as (
select
outlet
,count(1) as stories
from stories
group by outlet
)
,total as (
select
sum(stories) as total
from cte
)
select
cte.outlet
,cte.stories
,bias.outlet
,bias.lean
,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
,total.total
from cte
join bias
on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
cross join total.total
""")
DB.sql("""
select
outlet
,count(1) as stories
from stories
group by outlet
order by count(1) desc
limit 50
""")
outlets
@cli.command()
def parse_html() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser)
root = tree.getroot()
rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')
ratings = []
for row in rows:
rating = dict()
publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text
rating['publisher'] = publisher
bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
bias = bias.split('/')[-1]
rating['bias'] = bias
agree = row.xpath('.//span[contains(@class, "agree")]')[0].text
disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text
rating['agree'] = int(agree)
rating['disagree'] = int(disagree)
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
if __name__ == "__main__":
cli()

View File

@@ -1,46 +0,0 @@
import click
import duckdb
from data import connect
import polars as ps
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")
DB.sql("""
with cte as (
select
outlet
,count(1) as stories
from stories
group by outlet
)
,total as (
select
sum(stories) as total
from cte
)
select
cte.outlet
,cte.stories
,bias.outlet
,bias.lean
,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
,total.total
from cte
join bias
on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
cross join total.total
""")
DB.sql("""
select
outlet
,count(1) as stories
from stories
group by outlet
order by count(1) desc
limit 50
""")
outlets

View File

@@ -59,27 +59,28 @@ def download(output_dir):
@cli.command()
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
@click.option('-o', '--output_dir', type=Path, default=data_dir())
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
"""parse the html files on disk into a structured csv format."""
directory = data_dir() / "memeorandum"
parser = etree.HTMLParser()
pages = [f for f in directory.glob("*.html")]
published = []
others = []
#page = pages[0]
# page = pages[0]
page_iter = tqdm(pages, postfix="starting")
for page in page_iter:
page_iter.set_postfix_str(f"{page}")
date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
# tree = etree.parse(str(page), parser)
tree = etree.parse(str(page), parser)
root = tree.getroot()
if not root:
if root is None:
print(f"error opening {page}")
continue
items = root.xpath("//div[contains(@class, 'item')]")
# item = items[0]
for item in items:
out = dict()
citation = item.xpath('./cite')
@@ -92,16 +93,24 @@ def parse(directory, output_dir):
author = ''
out['author'] = author
try:
url = citation[0].getchildren()[0].get('href')
publisher_url = citation[0].getchildren()[0].get('href')
publisher = citation[0].getchildren()[0].text
except IndexError as e:
print(f"error with citation url: {page}")
out['publisher'] = publisher
out['publisher_url'] = url
out['publisher_url'] = publisher_url
title = item.xpath('.//strong/a')[0].text
out['title'] = title
item_id = hash((title,page.stem,url))
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
item_id = hash((page.stem, url))
out['id'] = item_id
old_id = hash((title, page.stem, publisher_url))
out['old_id'] = old_id
published.append(out)
related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -113,9 +122,22 @@ def parse(directory, output_dir):
another['parent_id'] = item_id
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False)
df = pd.DataFrame(others)
df.to_csv(output_dir / 'related.csv', sep='|', index=False)
df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False)
@cli.command()
def normalize():
DB = connect()
DB.sql("""
create table publishers as
select
row_number() over(order by publisher) as id
,publisher
,publisher_url
from stories
group by publisher, publisher_url
""")
if __name__ == "__main__":

View File

@@ -3,28 +3,49 @@ from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
from data import Data, from_db
from data import Data, from_db, connect
@click.group()
def cli():
...
@cli.command()
def max_sequence():
db = connect()
longest = db.sql("""
select
title
from stories
order by length(title) desc
limit 5000
""").df()
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokens = tokenizer(longest['title'].to_list())
print(f"{max([len(x) for x in tokens['input_ids']])}")
@cli.command()
def train():
table = from_db(Data.Titles)
n_classes = 10
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
def get_embeddings(titles):
# create tokens, padding to max width
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
outputs = model(**tokens)
return outputs.last_hidden_state[:, 0, :]
titles = table['title'].apply(str).to_list()[:10]
get_embeddings(titles)
# exploratory leftovers (outputs is local to get_embeddings):
# outputs.last_hidden_state[0][200:]
# outputs.values().shape
# model
# linear = torch.nn.Linear(model.config.hidden_size, n_classes)
# act = torch.nn.Sigmoid()