v1.0 of presentation.

This commit is contained in:
matt
2023-05-17 13:38:07 -07:00
parent 4d93cf7adb
commit 74c2d8afa2
37 changed files with 1959 additions and 144 deletions

6
src/data/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
import data.main
import data.scrape
# Submodules exposed by ``from data import *``.
__all__ = ['main', 'scrape']

30
src/data/main.py Normal file
View File

@@ -0,0 +1,30 @@
import os
from pathlib import Path
import duckdb
from enum import Enum
class Data(str, Enum):
    """Identifiers of the datasets that can be requested from the database.

    Members mix in ``str``, so each one compares equal to its plain string
    value (``Data.Titles == 'titles'``).
    """

    Titles = 'titles'
def data_dir():
    """Return the project data directory as a ``Path``.

    The location is read from the ``DATA_MINING_DATA_DIR`` environment
    variable; a ``KeyError`` is raised when the variable is not set.
    """
    location = os.environ['DATA_MINING_DATA_DIR']
    return Path(location)
def connect():
    """Open and return a DuckDB connection to the project database.

    The database file is ``project.duckdb`` inside the directory named by the
    ``DATA_MINING_DATA_DIR`` environment variable. The caller owns the
    returned connection.
    """
    db_file = Path(os.environ['DATA_MINING_DATA_DIR']) / 'project.duckdb'
    return duckdb.connect(str(db_file))
def from_db(t: Data):
    """Load one of the known datasets from ``project.duckdb``.

    Args:
        t: which dataset to load. Only ``Data.Titles`` is implemented; it
            selects up to 100 distinct ``title`` values from ``stories``.

    Returns:
        The query result converted with ``.df()``.

    Raises:
        ValueError: if ``t`` is not a supported dataset. (The previous code
            fell through to an ``UnboundLocalError`` in that case.)
        KeyError: if ``DATA_MINING_DATA_DIR`` is not set.
    """
    # Reject unknown datasets before touching the database file.
    if t != Data.Titles:
        raise ValueError(f"unsupported dataset: {t!r}")

    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
    try:
        # The result is converted inside the ``try`` so the connection is
        # still open while the rows are fetched.
        return DB.sql("""
select
distinct
title
from stories
limit 100
""").df()
    finally:
        # Always release the database file, even when the query fails.
        DB.close()

337
src/data/scrape.py Normal file
View File

@@ -0,0 +1,337 @@
from datetime import date, timedelta
import datetime
import requests
from pathlib import Path
import click
from tqdm import tqdm
from data.main import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
def load(directory, database):
    """load stories.csv and related.csv from DIRECTORY into the database."""
    # NOTE(review): ``database`` is accepted but not used; the tables are
    # written to whatever ``connect()`` opens. Left unchanged so existing
    # invocations keep writing to the same file -- confirm the intent.

    def sql_literal(path):
        # Double any single quote so a path containing "'" cannot terminate
        # the SQL string literal early.
        return str(path).replace("'", "''")

    stories = sql_literal(directory / "stories.csv")
    related = sql_literal(directory / "related.csv")
    db = connect()
    try:
        db.sql(f"""
CREATE OR REPLACE TABLE stories AS
SELECT
*
FROM read_csv_auto('{stories}')
""")
        db.sql(f"""
CREATE OR REPLACE TABLE related_stories AS
SELECT
*
FROM read_csv_auto('{related}')
""")
    finally:
        # Close the connection even when one of the loads fails.
        db.close()
@click.command(name='scrape:download')
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
def download(output_dir):
    """download every day from 2005-10-01 to today from memeorandum.com"""
    # Make sure there is somewhere to write to before starting.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect only the days that have not been saved yet.
    day = timedelta(days=1)
    cur = date(2005, 10, 1)
    end = date.today()
    dates = []
    while cur <= end:
        if not (output_dir / f"{cur.strftime('%y-%m-%d')}.html").exists():
            dates.append(cur)
        cur = cur + day

    date_iter = tqdm(dates, postfix="test")
    for i in date_iter:
        date_iter.set_postfix_str(f"{i}")
        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
        try:
            # A timeout keeps one stalled request from hanging the whole run.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException as e:
            # Do not save an error page: the file would then exist and the
            # day would never be retried on a later run.
            print(f"error downloading {url}: {e}")
            continue
        with open(save_as, 'w') as f:
            f.write(r.text)
@click.command(name='scrape:parse')
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
    """parse the html files on disk into a structured csv format."""
    # Writes two pipe-separated files into ``output_dir``:
    #   stories.csv - one row per headline item found in the pages
    #   related.csv - one row per related link, pointing at its story's id
    # ``directory`` is now honoured; it used to be overwritten with the
    # default location, which silently ignored the command-line option.
    update_tld_names()
    parser = etree.HTMLParser()
    pages = list(directory.glob("*.html"))
    published = []
    others = []
    page_iter = tqdm(pages, postfix="starting")
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        # File names carry the day the page was captured for (yy-mm-dd).
        published_at = datetime.datetime.strptime(page.stem, '%y-%m-%d')
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        if root is None:
            print(f"error opening {page}")
            continue
        for item in root.xpath("//div[contains(@class, 'item')]"):
            citation = item.xpath('./cite')
            if not citation:
                continue
            cite = citation[0]

            # Author is whatever precedes the last '/' in the cite text.
            if cite.text:
                author = ''.join(cite.text.split('/')[:-1]).strip()
            else:
                author = ''

            try:
                publisher_link = cite.getchildren()[0]
            except IndexError:
                # Without a publisher link there is nothing to record; skip
                # the item instead of reusing values from a previous one.
                print(f"error with citation url: {page}")
                continue
            publisher_url = publisher_link.get('href')
            publisher = publisher_link.text

            headline = item.xpath('.//strong/a')[0]
            title = headline.text
            url = headline.get('href')

            # NOTE(review): ``hash()`` of strings is randomised per process
            # unless PYTHONHASHSEED is fixed, so these ids are only
            # consistent within a single run of this command.
            item_id = hash((page.stem, url))

            # Key order determines the column order of stories.csv.
            out = dict()
            out['published_at'] = published_at
            out['author'] = author
            out['publisher'] = publisher
            out['publisher_url'] = publisher_url
            out['title'] = title
            out['url'] = url
            out['tld'] = get_tld(publisher_url)
            out['id'] = item_id
            published.append(out)

            for relation in item.xpath(".//span[contains(@class, 'mls')]/a"):
                another = dict()
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
                another['publisher_domain'] = urlparse(another['url']).netloc
                others.append(another)

    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
@click.command(name='scrape:normalize')
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    # The statements run in order and each depends on the previous ones:
    #   1. delete stories that repeat a url or a title
    #   2. build ``publishers`` from stories and related_stories
    #   3-6. add/populate ``stories.publisher_id``, then drop the columns
    #        it replaces
    #   7-9. same for ``related_stories``
    # NOTE(review): the SQL reads ``publisher_url_domain`` and drops
    # ``domain``; confirm those columns exist in the loaded ``stories`` table.
    statements = (
        """
DELETE FROM stories
WHERE id IN (
WITH cte AS (
SELECT
url
,id
,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
FROM stories
)
SELECT
id
FROM cte
WHERE url_ctn > 1
OR title_ctn > 1
)
""",
        """
CREATE OR REPLACE TABLE publishers AS
with cte as (
SELECT
s.publisher as name
,s.publisher_url_domain as url
FROM stories s
GROUP BY
s.publisher
,s.publisher_url_domain
), together AS (
SELECT
COALESCE(cte.name, r.publisher) AS name
,COALESCE(cte.url, r.publisher_domain) as url
FROM cte
FULL OUTER JOIN related_stories r
ON cte.url = r.publisher_domain
)
SELECT
ROW_NUMBER() OVER() as id
,t.name
,t.url
FROM together t
where t.url is not null
GROUP BY
name
,url
""",
        """
alter table stories
add column publisher_id bigint
""",
        """
update stories
set publisher_id = publishers.id
from publishers
where publishers.url = stories.publisher_url_domain
""",
        # The column was added as bigint above; this statement is kept
        # as it was in the original sequence.
        """
alter table stories alter publisher_id set data type bigint
""",
        """
alter table stories drop publisher;
alter table stories drop publisher_url;
alter table stories drop publisher_url_domain;
alter table stories drop domain;
""",
        """
alter table related_stories
add column publisher_id bigint
""",
        """
update related_stories
set publisher_id = publishers.id
from publishers
where publishers.url = related_stories.publisher_domain
""",
        """
alter table related_stories drop publisher;
alter table related_stories drop publisher_domain;
""",
    )
    DB = connect()
    try:
        for statement in statements:
            DB.sql(statement)
    finally:
        # Release the database file whether or not every step succeeded,
        # matching what ``load`` does with its connection.
        DB.close()
def another_norm():
    """Scratch steps for re-keying publishers by the domain of their url.

    NOTE(review): the visible source has lost its indentation, so the nesting
    used here (every statement, including ``map_tld`` and the final query,
    inside this function) is a reconstruction -- confirm against the real
    file.

    NOTE(review): as written this does not look runnable top to bottom:
    ``data_dir`` is used without being called, ``DB`` is never assigned in
    this function, there is a bare ``select`` name, and ``new_pub`` /
    ``map_tld`` / ``publishers.tld`` are referenced before the statements
    that create them. The statements read like notebook cells kept for
    reference rather than an ordered procedure.
    """
    # NOTE(review): ``data_dir`` is imported as a function at the top of this
    # file; ``data_dir / '...'`` without calling it would raise TypeError.
    sv2 = pd.read_csv(data_dir / 'stories.csv', sep="|")
    related = pd.read_csv(data_dir / 'related.csv', sep="|")
    # Derive a ``tld`` column for each related link from its url.
    related['tld'] = related.url.apply(lambda x: map_tld(x))
    # Point related_stories at publishers by matching on that derived column.
    # NOTE(review): ``related`` in the SQL presumably refers to the DataFrame
    # above -- confirm.
    DB.query("""
update related_stories
set publisher_id = p.id
from publishers p
join related r
on r.tld = p.tld
where r.url = related_stories.url
""")
    DB.query("""alter table stories add column tld text""")
    # Pull id/url pairs out so the tld can be computed in Python.
    s_url = DB.query("""
select
id
,url
from stories
""").df()
    s_url['tld'] = s_url.url.apply(lambda x: map_tld(x))
    # Write the computed tld back onto stories, matching rows by id.
    DB.query("""
update stories
set tld = s_url.tld
from s_url
where s_url.id = stories.id
""")
    # Attach each story to the publisher with the same tld.
    DB.query("""
update stories
set publisher_id = p.id
from publishers p
where p.tld = stories.tld
""")
    # NOTE(review): bare name with no definition in the visible source; it
    # would raise NameError if this line were reached.
    select
    # NOTE(review): the SQL below uses alias ``p`` without introducing it and
    # has no join condition; ``new_pub`` is only assigned further down.
    DB.query("""
update stories
set stories.publisher_id = p.id
from new_pub
""")
    sv2['tld'] = sv2.publisher_url.apply(lambda x: map_tld(x))
    # One publisher name per tld: the name with the highest 2022 story count
    # (rn = 1), numbered with row_number() as its id.
    new_pub = DB.query("""
with cte as (
select
tld
,publisher
,count(1) filter(where year(published_at) = 2022) as recent_ctn
,count(1) as ctn
from sv2
group by
tld
,publisher
)
,r as (
select
tld
,publisher
,ctn
,row_number() over(partition by tld order by recent_ctn desc) as rn
from cte
)
select
row_number() over() as id
,publisher as name
,tld
from r
where rn = 1
order by ctn desc
""").df()
    # Replace the publishers table with the result computed above.
    DB.query("""
CREATE OR REPLACE TABLE publishers AS
SELECT
id
,name
,tld
FROM new_pub
""")
    def map_tld(x):
        """Return the ``fld`` attribute of ``get_tld(x, as_object=True)``, or None on any error."""
        try:
            res = get_tld(x, as_object=True)
            return res.fld
        # NOTE(review): a bare ``except`` also swallows KeyboardInterrupt and
        # SystemExit; ``except Exception`` would keep the best-effort
        # behaviour without that.
        except:
            return None
    # Spot-check query joining stories to the CSV rows by id; the result is
    # not assigned or returned.
    DB.sql("""
SELECT
s.id
,sv2.publisher_url
FROM stories s
JOIN sv2
on sv2.id = s.id
limit 5
""")