v1.0 of presentation.

2023-05-17 13:38:07 -07:00
parent 4d93cf7adb
commit 74c2d8afa2
37 changed files with 1959 additions and 144 deletions
--- a/src/data/init.py
+++ b/src/data/init.py
@@ -0,0 +1,6 @@
+import data.main
+import data.scrape
+# Submodules exported by ``from data import *``.
+__all__ = [
+    'main',
+    'scrape',
+]
--- a/src/data/main.py
+++ b/src/data/main.py
@@ -0,0 +1,30 @@
+import os
+from pathlib import Path
+import duckdb
+from enum import Enum
+
+class Data(str, Enum):
+    """Names of the datasets that ``from_db`` knows how to load.
+
+    Members are also ``str`` instances, so a member compares equal to its
+    plain string value (``Data.Titles == 'titles'``).
+    """
+
+    # Selects the distinct story titles query in ``from_db``.
+    Titles = 'titles'
+
+def data_dir():
+    """Return the data directory named by ``DATA_MINING_DATA_DIR``.
+
+    Raises:
+        KeyError: if the environment variable is not set.
+    """
+    root = os.environ['DATA_MINING_DATA_DIR']
+    return Path(root)
+
+def connect():
+    """Open and return a DuckDB connection to ``project.duckdb``.
+
+    The database file lives in the directory named by the
+    ``DATA_MINING_DATA_DIR`` environment variable. The caller owns the
+    returned connection and is responsible for closing it.
+
+    Raises:
+        KeyError: if ``DATA_MINING_DATA_DIR`` is not set.
+    """
+    db_file = Path(os.environ['DATA_MINING_DATA_DIR']) / 'project.duckdb'
+    return duckdb.connect(str(db_file))
+
+def from_db(t: Data):
+    """Load one of the predefined datasets from ``project.duckdb``.
+
+    Args:
+        t: which dataset to load. Only ``Data.Titles`` is implemented; it
+            selects up to 100 distinct story titles from ``stories``.
+
+    Returns:
+        The query result converted with ``.df()``.
+
+    Raises:
+        ValueError: if ``t`` is not a supported dataset.
+        KeyError: if ``DATA_MINING_DATA_DIR`` is not set.
+    """
+    # Reject unknown datasets before touching the database. Previously this
+    # fell through to ``return table`` with ``table`` never assigned.
+    if t != Data.Titles:
+        raise ValueError(f"unsupported dataset: {t!r}")
+
+    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
+    try:
+        # .df() materializes the rows, so the result stays usable after the
+        # connection is closed in the finally clause.
+        return DB.sql("""
+            select
+                distinct
+                title 
+            from stories
+            limit 100
+        """).df()
+    finally:
+        DB.close()
--- a/src/data/scrape.py
+++ b/src/data/scrape.py
@@ -0,0 +1,337 @@
+from datetime import date, timedelta
+import datetime
+import requests
+from pathlib import Path
+import click
+from tqdm import tqdm
+from data.main import data_dir, connect
+from lxml import etree
+import pandas as pd
+from urllib.parse import urlparse
+from tld import get_tld
+from tld.utils import update_tld_names
+
+@click.command(name='scrape:load')
+@click.option('--directory', type=Path, default=data_dir(), show_default=True)
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
+def load(directory, database):
+    """load stories.csv and related.csv into the stories and related_stories tables."""
+    # NOTE(review): `database` is accepted but never used. connect() takes
+    # no arguments and always opens project.duckdb in the data directory, so
+    # this option currently has no effect.
+    sources = (
+        ('stories', directory / "stories.csv"),
+        ('related_stories', directory / "related.csv"),
+    )
+    db = connect()
+    # try/finally so the connection is released even if a statement fails.
+    try:
+        for table, csv_path in sources:
+            # The path is embedded in a SQL string literal, so double any
+            # single quote to keep the literal well-formed.
+            quoted = str(csv_path).replace("'", "''")
+            db.sql(f"""
+                CREATE OR REPLACE TABLE {table} AS
+                SELECT
+                    *
+                FROM read_csv_auto('{quoted}')
+            """)
+    finally:
+        db.close()
+
+@click.command(name='scrape:download')
+@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
+def download(output_dir):
+    """download every day from 2005-10-01 to today from memeorandum.com"""
+    # Create the target directory up front so the first write cannot fail
+    # just because the directory is missing.
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    one_day = timedelta(days=1)
+    cur = date(2005, 10, 1)
+    end = date.today()
+    # Collect only the days that have no saved page yet, which makes the
+    # command resumable and gives the progress bar an accurate total.
+    pending = []
+    while cur <= end:
+        save_as = output_dir / f"{cur.strftime('%y-%m-%d')}.html"
+        if not save_as.exists():
+            pending.append((cur, save_as))
+        cur += one_day
+
+    date_iter = tqdm(pending, postfix="test")
+    for day, save_as in date_iter:
+        date_iter.set_postfix_str(f"{day}")
+        url = f"https://www.memeorandum.com/{day.strftime('%y%m%d')}/h2000"
+        try:
+            # A timeout keeps one stalled request from hanging the whole run.
+            r = requests.get(url, timeout=30)
+            # Do not save error pages: a saved file marks the day as done,
+            # so a failed day must stay missing to be retried next run.
+            r.raise_for_status()
+        except requests.RequestException as e:
+            print(f"error downloading {url}: {e}")
+            continue
+        # Explicit encoding so the write does not depend on the platform
+        # default and cannot fail on characters outside it.
+        with open(save_as, 'w', encoding='utf-8') as f:
+            f.write(r.text)
+
+
+def _stable_id(*parts):
+    """Return a signed 64-bit integer id derived from ``parts``.
+
+    Built-in ``hash()`` on strings is salted per process, so ids made with it
+    differ between runs. A digest gives the same id for the same input every
+    time, and 8 bytes read as a signed integer fits a 64-bit integer column.
+    """
+    import hashlib
+
+    key = '|'.join(str(p) for p in parts).encode('utf-8')
+    return int.from_bytes(hashlib.sha1(key).digest()[:8], 'big', signed=True)
+
+
+@click.command(name='scrape:parse')
+@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
+@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
+def parse(directory, output_dir):
+    """parse the html files on disk into a structured csv format."""
+    # Writes two '|'-separated files into output_dir:
+    #   stories.csv - one row per headline item
+    #   related.csv - one row per related link, pointing at its story through
+    #                 parent_id
+    # `directory` is used as given; it was previously overwritten with the
+    # default path, which made the -d option a no-op.
+    update_tld_names()
+    parser = etree.HTMLParser()
+    # Sorted so the row order does not depend on directory listing order.
+    pages = sorted(directory.glob("*.html"))
+    published = []
+    others = []
+    page_iter = tqdm(pages, postfix="starting")
+    for page in page_iter:
+        page_iter.set_postfix_str(f"{page}")
+        # File stems are the dates written by scrape:download (yy-mm-dd).
+        published_at = datetime.datetime.strptime(page.stem, '%y-%m-%d')
+        root = etree.parse(str(page), parser).getroot()
+        if root is None:
+            print(f"error opening {page}")
+            continue
+
+        for item in root.xpath("//div[contains(@class, 'item')]"):
+            citation = item.xpath('./cite')
+            if not citation:
+                continue
+            cite = citation[0]
+
+            # The class match above is a substring test, so it can select
+            # divs that carry no headline link; skip those.
+            headlines = item.xpath('.//strong/a')
+            if not headlines:
+                continue
+            headline = headlines[0]
+
+            # Everything before the last '/' in the cite text is the author.
+            author = ''
+            if cite.text:
+                author = ''.join(cite.text.split('/')[:-1]).strip()
+
+            # The first child of <cite> is the publisher link. When it is
+            # missing, record empty values so that names from a previous
+            # item (or unbound names on the first item) are never used.
+            try:
+                publisher_link = cite[0]
+            except IndexError:
+                print(f"error with citation url: {page}")
+                publisher = None
+                publisher_url = None
+            else:
+                publisher = publisher_link.text
+                publisher_url = publisher_link.get('href')
+
+            url = headline.get('href')
+            item_id = _stable_id(page.stem, url)
+
+            # Key order defines the column order of stories.csv.
+            published.append({
+                'published_at': published_at,
+                'author': author,
+                'publisher': publisher,
+                'publisher_url': publisher_url,
+                'title': headline.text,
+                'url': url,
+                # fail_silently: one malformed URL yields None and does not
+                # abort the whole run.
+                'tld': get_tld(publisher_url, fail_silently=True) if publisher_url else None,
+                'id': item_id,
+            })
+
+            for relation in item.xpath(".//span[contains(@class, 'mls')]/a"):
+                href = relation.get('href')
+                others.append({
+                    'url': href,
+                    'publisher': relation.text,
+                    'parent_id': item_id,
+                    'publisher_domain': urlparse(href).netloc if href else None,
+                })
+
+    pd.DataFrame(published).to_csv(output_dir / 'stories.csv', sep='|', index=False)
+    pd.DataFrame(others).to_csv(output_dir / 'related.csv', sep='|', index=False)
+def _normalize_tables(DB):
+    """Run the normalization statements, in order, on the open connection ``DB``."""
+    # Delete every row that is a second-or-later occurrence of its url or of
+    # its title. ROW_NUMBER() has no ORDER BY here, so which duplicate is kept
+    # is not specified.
+    DB.sql("""
+        DELETE FROM stories
+        WHERE id IN (
+            WITH cte AS (
+                SELECT
+                    url
+                    ,id
+                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
+                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
+                FROM stories
+            )
+            SELECT
+                id
+            FROM cte
+            WHERE url_ctn > 1
+            OR title_ctn > 1
+        )
+    """)
+
+
+
+    # Build publishers from the (publisher, publisher_url_domain) pairs in
+    # stories, full-outer-joined with related_stories on the domain.
+    # NOTE(review): the stories.csv written by scrape:parse in this file has a
+    # `tld` column but no `publisher_url_domain` or `domain` column; confirm
+    # the stories schema this command is meant to run against.
+    DB.sql("""
+        CREATE OR REPLACE TABLE publishers AS
+        with cte as (
+            SELECT
+                s.publisher as name
+                ,s.publisher_url_domain as url
+            FROM stories s
+            GROUP BY 
+                s.publisher
+                ,s.publisher_url_domain
+        ), together AS (
+            SELECT
+                COALESCE(cte.name, r.publisher) AS name
+                ,COALESCE(cte.url, r.publisher_domain) as url
+            FROM cte
+            FULL OUTER JOIN related_stories r
+            ON cte.url = r.publisher_domain
+        )
+        SELECT
+            ROW_NUMBER() OVER() as id
+            ,t.name
+            ,t.url
+        FROM together t
+        where t.url is not null
+        GROUP BY
+            name
+            ,url
+    """)
+
+    # NOTE(review): this presumably fails if publisher_id already exists,
+    # i.e. when the command is run a second time on the same tables.
+    DB.sql("""
+        alter table stories
+        add column publisher_id bigint
+    """)
+
+    # Point each story at its publisher by matching on the domain.
+    DB.sql("""
+        update stories
+        set publisher_id = publishers.id
+        from publishers
+        where publishers.url = stories.publisher_url_domain
+    """)
+
+    # The column was already added as bigint above.
+    DB.sql("""
+        alter table stories alter publisher_id set data type bigint
+    """)
+
+
+    # The publisher text columns are replaced by publisher_id.
+    DB.sql("""
+        alter table stories drop publisher;
+        alter table stories drop publisher_url;
+        alter table stories drop publisher_url_domain;
+        alter table stories drop domain;
+    """)
+
+    # Same steps for related_stories: add the key, fill it, drop the text.
+    DB.sql("""
+        alter table related_stories
+        add column publisher_id bigint
+    """)
+
+
+    DB.sql("""
+        update related_stories
+        set publisher_id = publishers.id
+        from publishers
+        where publishers.url = related_stories.publisher_domain
+    """)
+
+    DB.sql("""
+        alter table related_stories drop publisher;
+        alter table related_stories drop publisher_domain;
+    """)
+
+
+@click.command(name='scrape:normalize')
+def normalize():
+    """fix database after load. remove duplicates. create publishers."""
+    DB = connect()
+    # Release the connection even when one of the statements fails.
+    try:
+        _normalize_tables(DB)
+    finally:
+        DB.close()
+
+
+def another_norm():
+    """Scratch steps for re-keying publishers by the ``fld`` of each URL.
+
+    NOTE(review): this function has no click decorator and cannot run as
+    written. The blockers visible in this body are:
+
+    * ``data_dir`` is used as a path without being called.
+    * ``DB`` is never assigned in this function and is not imported by this
+      module.
+    * ``map_tld`` is defined near the end of the body but called from the
+      first statements onwards.
+    * a bare ``select`` name stands alone as a statement.
+    * ``new_pub`` is referenced in SQL before it is assigned.
+
+    The SQL strings refer to the local DataFrames (``related``, ``s_url``,
+    ``sv2``, ``new_pub``) by their Python variable names, presumably relying
+    on DuckDB resolving them from the calling scope, so renaming a local
+    changes the queries.
+    """
+    # NOTE(review): `data_dir` is a function; `data_dir / ...` needs a call.
+    sv2 = pd.read_csv(data_dir / 'stories.csv', sep="|")
+    related = pd.read_csv(data_dir / 'related.csv', sep="|")
+
+    # NOTE(review): map_tld is assigned later in this body, so this call
+    # happens before the name is bound.
+    related['tld'] = related.url.apply(lambda x: map_tld(x))
+
+    # Link related_stories to publishers through the tld computed above.
+    # NOTE(review): relies on publishers already having a `tld` column; that
+    # table is only (re)created with `tld` further down.
+    DB.query("""
+        update related_stories
+        set publisher_id = p.id
+        from publishers p
+        join related r
+        on r.tld = p.tld
+        where r.url = related_stories.url
+    """)
+
+
+    DB.query("""alter table stories add column tld text""")
+
+    # Pull id/url pairs into pandas so the tld can be computed in Python.
+    s_url = DB.query("""
+    select
+        id
+        ,url
+        from stories
+    """).df()
+
+
+    s_url['tld'] = s_url.url.apply(lambda x: map_tld(x))
+
+    # Write the computed tld back onto stories, matching on id.
+    DB.query("""
+        update stories
+        set tld = s_url.tld
+        from s_url
+        where s_url.id = stories.id
+    """)
+
+    DB.query("""
+        update stories
+        set publisher_id = p.id
+        from publishers p
+        where p.tld = stories.tld
+    """)
+
+
+    # NOTE(review): stray name; evaluating it raises NameError.
+    select
+    # NOTE(review): `p` is not defined in this statement, there is no WHERE
+    # clause, and new_pub is only assigned below.
+    DB.query("""
+        update stories
+        set stories.publisher_id = p.id
+        from new_pub
+    """)
+    sv2['tld'] = sv2.publisher_url.apply(lambda x: map_tld(x))
+
+
+    # One publisher per tld: among the publisher names seen for a tld, keep
+    # the one with the most 2022 stories (rn = 1), and number the result rows.
+    new_pub = DB.query("""
+        with cte as (
+            select
+                tld
+                ,publisher
+                ,count(1) filter(where year(published_at) = 2022) as recent_ctn
+                ,count(1) as ctn
+            from sv2
+            group by
+                tld
+                ,publisher
+        )
+        ,r as (
+        select
+            tld
+            ,publisher
+            ,ctn
+            ,row_number() over(partition by tld order by recent_ctn desc) as rn
+        from cte
+        )
+        select
+            row_number() over() as id
+            ,publisher as name
+            ,tld
+        from r
+        where rn = 1
+        order by ctn desc
+    """).df()
+
+    # Replace the publishers table with the tld-keyed version.
+    DB.query("""
+        CREATE OR REPLACE TABLE publishers AS
+        SELECT
+            id
+            ,name
+            ,tld
+        FROM new_pub
+    """)
+
+
+    # Returns the `fld` attribute of the get_tld result, or None on any
+    # failure. NOTE(review): the bare `except:` also swallows
+    # KeyboardInterrupt and SystemExit.
+    def map_tld(x):
+        try:
+            res = get_tld(x, as_object=True)
+            return res.fld
+        except:
+            return None
+
+    # Sample join between the stories table and the CSV frame; the result is
+    # not assigned or returned.
+    DB.sql("""
+        SELECT
+            s.id
+            ,sv2.publisher_url
+        FROM stories s
+        JOIN sv2
+        on sv2.id = s.id
+        limit 5
+    """)