v1.0 of presentation.
This commit is contained in:
6
src/data/__init__.py
Normal file
6
src/data/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
import data.main
|
||||
import data.scrape
|
||||
# Submodules exposed by ``from data import *``.
__all__ = ["main", "scrape"]
|
||||
30
src/data/main.py
Normal file
30
src/data/main.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import duckdb
|
||||
from enum import Enum
|
||||
|
||||
class Data(str, Enum):
    """Names of the datasets that ``from_db`` knows how to load."""

    # Members are also plain strings because of the ``str`` mixin.
    Titles = "titles"
|
||||
|
||||
def data_dir():
    """Return the directory named by the ``DATA_MINING_DATA_DIR`` variable.

    Raises:
        KeyError: if the environment variable is not set.
    """
    location = os.environ['DATA_MINING_DATA_DIR']
    return Path(location)
|
||||
|
||||
def connect():
    """Open a DuckDB connection to ``project.duckdb`` in the data directory.

    The directory is read from the ``DATA_MINING_DATA_DIR`` environment
    variable (``KeyError`` if unset). The connection is returned open.
    """
    db_file = Path(os.environ['DATA_MINING_DATA_DIR']) / 'project.duckdb'
    return duckdb.connect(str(db_file))
|
||||
|
||||
def from_db(t: Data):
    """Load the dataset ``t`` from ``project.duckdb`` as a DataFrame.

    Args:
        t: which dataset to load; ``Data.Titles`` is the only one supported.

    Returns:
        For ``Data.Titles``: up to 100 distinct ``title`` values from the
        ``stories`` table, as returned by the relation's ``.df()``.

    Raises:
        ValueError: if ``t`` is not a supported dataset.
        KeyError: if ``DATA_MINING_DATA_DIR`` is not set.
    """
    # Validate before touching the database so an unsupported value gives a
    # clear error instead of reaching the return with nothing assigned.
    if t != Data.Titles:
        raise ValueError(f"unsupported dataset: {t!r}")

    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    db = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
    try:
        # .df() materialises the rows, so the frame stays usable after close().
        return db.sql("""
            select
                distinct
                title
            from stories
            limit 100
        """).df()
    finally:
        db.close()
|
||||
337
src/data/scrape.py
Normal file
337
src/data/scrape.py
Normal file
@@ -0,0 +1,337 @@
|
||||
from datetime import date, timedelta
|
||||
import datetime
|
||||
import requests
|
||||
from pathlib import Path
|
||||
import click
|
||||
from tqdm import tqdm
|
||||
from data.main import data_dir, connect
|
||||
from lxml import etree
|
||||
import pandas as pd
|
||||
from urllib.parse import urlparse
|
||||
from tld import get_tld
|
||||
from tld.utils import update_tld_names
|
||||
|
||||
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
def load(directory, database):
    """load stories.csv and related.csv from the directory into duckdb tables."""

    def as_sql_literal(path):
        # Double any single quote so the path cannot terminate the SQL string.
        return str(path).replace("'", "''")

    stories = as_sql_literal(directory / "stories.csv")
    related = as_sql_literal(directory / "related.csv")

    # NOTE(review): ``database`` is accepted but never used; the tables are
    # written to whatever connect() opens. Left as is so the other commands,
    # which also use connect(), keep seeing the loaded tables.
    db = connect()
    try:
        db.sql(f"""
            CREATE OR REPLACE TABLE stories AS
            SELECT
                *
            FROM read_csv_auto('{stories}')
        """)

        db.sql(f"""
            CREATE OR REPLACE TABLE related_stories AS
            SELECT
                *
            FROM read_csv_auto('{related}')
        """)
    finally:
        # Release the database file even when one of the statements fails.
        db.close()
|
||||
|
||||
@click.command(name='scrape:download')
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
def download(output_dir):
    """download every day from 2005-10-01 to today from memeorandum.com"""
    # Make sure there is somewhere to write to on a first run.
    output_dir.mkdir(parents=True, exist_ok=True)

    one_day = timedelta(days=1)
    cur = date(2005, 10, 1)
    end = date.today()

    # Collect only the days that have not been saved yet so the progress bar
    # reflects the remaining work.
    dates = []
    while cur <= end:
        if not (output_dir / f"{cur.strftime('%y-%m-%d')}.html").exists():
            dates.append(cur)
        cur = cur + one_day

    date_iter = tqdm(dates, postfix="test")
    for i in date_iter:
        date_iter.set_postfix_str(f"{i}")
        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
        if save_as.exists():
            continue
        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, timeout=30)
        if not r.ok:
            # Do not save error pages: the exists() checks above would treat
            # them as downloaded and never fetch that day again.
            tqdm.write(f"skipping {url}: HTTP {r.status_code}")
            continue
        # Explicit encoding so the write does not depend on the platform default.
        with open(save_as, 'w', encoding='utf-8') as f:
            f.write(r.text)
|
||||
|
||||
|
||||
def _stable_id(*parts):
    """Return a signed 64-bit id derived deterministically from ``parts``."""
    # Built-in hash() of strings is salted per process, so it cannot give ids
    # that are reproducible between runs; a digest can.
    import hashlib

    joined = "\x1f".join(str(part) for part in parts)
    digest = hashlib.sha1(joined.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)


@click.command(name='scrape:parse')
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
    """parse the html files on disk into a structured csv format."""
    update_tld_names()
    # ``directory`` comes from the -d/--directory option and is used as given.
    parser = etree.HTMLParser()
    pages = list(directory.glob("*.html"))
    published = []
    others = []

    page_iter = tqdm(pages, postfix="starting")
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        # File names are written by the download step as yy-mm-dd.html.
        published_at = datetime.datetime.strptime(page.stem, '%y-%m-%d')
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        if root is None:
            print(f"error opening {page}")
            continue

        for item in root.xpath("//div[contains(@class, 'item')]"):
            citation = item.xpath('./cite')
            if not citation:
                continue
            cite = citation[0]

            # The cite text looks like "<author> / "; keep what precedes the
            # last slash.
            if cite.text:
                author = ''.join(cite.text.split('/')[:-1]).strip()
            else:
                author = ''

            # The first child element of <cite> carries the publisher link.
            children = list(cite)
            if children:
                publisher_url = children[0].get('href')
                publisher = children[0].text
            else:
                print(f"error with citation url: {page}")
                # Explicit blanks, so values from a previous item never leak
                # into this row.
                publisher_url = None
                publisher = None

            headline = item.xpath('.//strong/a')
            if not headline:
                print(f"error with headline link: {page}")
                continue
            title = headline[0].text
            url = headline[0].get('href')

            item_id = _stable_id(page.stem, url)

            # Key order here is the column order of stories.csv.
            published.append({
                'published_at': published_at,
                'author': author,
                'publisher': publisher,
                'publisher_url': publisher_url,
                'title': title,
                'url': url,
                # fail_silently makes get_tld return None instead of raising
                # on URLs it cannot resolve.
                'tld': get_tld(publisher_url, fail_silently=True) if publisher_url else None,
                'id': item_id,
            })

            for relation in item.xpath(".//span[contains(@class, 'mls')]/a"):
                href = relation.get('href')
                # Key order here is the column order of related.csv.
                others.append({
                    'url': href,
                    'publisher': relation.text,
                    'parent_id': item_id,
                    'publisher_domain': urlparse(href).netloc if href else None,
                })

    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)
|
||||
|
||||
@click.command(name='scrape:normalize')
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    DB = connect()
    try:
        # Drop every row that is a second-or-later occurrence of its url or of
        # its title. NOTE(review): the windows have no ORDER BY, so which
        # duplicate survives is up to the engine.
        DB.sql("""
            DELETE FROM stories
            WHERE id IN (
                WITH cte AS (
                    SELECT
                        url
                        ,id
                        ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
                        ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
                    FROM stories
                )
                SELECT
                    id
                FROM cte
                WHERE url_ctn > 1
                OR title_ctn > 1
            )
        """)

        # Build publishers from the (name, domain) pairs seen in stories,
        # outer-joined with the publishers seen in related_stories.
        # NOTE(review): this reads stories.publisher_url_domain, which the
        # parse command in this file does not write -- confirm the CSV schema.
        DB.sql("""
            CREATE OR REPLACE TABLE publishers AS
            with cte as (
                SELECT
                    s.publisher as name
                    ,s.publisher_url_domain as url
                FROM stories s
                GROUP BY
                    s.publisher
                    ,s.publisher_url_domain
            ), together AS (
                SELECT
                    COALESCE(cte.name, r.publisher) AS name
                    ,COALESCE(cte.url, r.publisher_domain) as url
                FROM cte
                FULL OUTER JOIN related_stories r
                ON cte.url = r.publisher_domain
            )
            SELECT
                ROW_NUMBER() OVER() as id
                ,t.name
                ,t.url
            FROM together t
            where t.url is not null
            GROUP BY
                name
                ,url
        """)

        # Point stories at publishers by id, then drop the denormalised columns.
        DB.sql("""
            alter table stories
            add column publisher_id bigint
        """)

        DB.sql("""
            update stories
            set publisher_id = publishers.id
            from publishers
            where publishers.url = stories.publisher_url_domain
        """)

        DB.sql("""
            alter table stories alter publisher_id set data type bigint
        """)

        # NOTE(review): ``domain`` is likewise not a column parse writes.
        DB.sql("""
            alter table stories drop publisher;
            alter table stories drop publisher_url;
            alter table stories drop publisher_url_domain;
            alter table stories drop domain;
        """)

        # Same relinking for related_stories.
        DB.sql("""
            alter table related_stories
            add column publisher_id bigint
        """)

        DB.sql("""
            update related_stories
            set publisher_id = publishers.id
            from publishers
            where publishers.url = related_stories.publisher_domain
        """)

        DB.sql("""
            alter table related_stories drop publisher;
            alter table related_stories drop publisher_domain;
        """)
    finally:
        # Release the database file whether or not every statement succeeded.
        DB.close()
|
||||
|
||||
|
||||
def another_norm():
    """Rebuild ``publishers`` keyed by ``tld`` and relink both story tables.

    Reads stories.csv and related.csv from the data directory, derives a
    ``tld`` value for each row with ``map_tld``, recreates the ``publishers``
    table with one row per ``tld``, and sets ``publisher_id`` on ``stories``
    and ``related_stories`` from it.

    NOTE(review): the statements are ordered so that ``publishers`` (with its
    ``tld`` column) exists before the updates that join on it. Confirm this
    matches the intended workflow before relying on it.
    """
    DB = connect()
    try:
        # The frame names below (sv2, related, new_pub, s_url) are referenced
        # by name inside the SQL, so they must not be renamed.
        sv2 = pd.read_csv(data_dir() / 'stories.csv', sep="|")
        related = pd.read_csv(data_dir() / 'related.csv', sep="|")

        related['tld'] = related.url.apply(map_tld)
        sv2['tld'] = sv2.publisher_url.apply(map_tld)

        # One publisher name per tld: the name with the most 2022 stories
        # wins; ids are handed out in descending order of story count.
        new_pub = DB.query("""
            with cte as (
                select
                    tld
                    ,publisher
                    ,count(1) filter(where year(published_at) = 2022) as recent_ctn
                    ,count(1) as ctn
                from sv2
                group by
                    tld
                    ,publisher
            )
            ,r as (
                select
                    tld
                    ,publisher
                    ,ctn
                    ,row_number() over(partition by tld order by recent_ctn desc) as rn
                from cte
            )
            select
                row_number() over() as id
                ,publisher as name
                ,tld
            from r
            where rn = 1
            order by ctn desc
        """).df()

        DB.query("""
            CREATE OR REPLACE TABLE publishers AS
            SELECT
                id
                ,name
                ,tld
            FROM new_pub
        """)

        DB.query("""
            update related_stories
            set publisher_id = p.id
            from publishers p
            join related r
            on r.tld = p.tld
            where r.url = related_stories.url
        """)

        # The column may already be present when the loaded CSV carried one.
        DB.query("""alter table stories add column if not exists tld text""")

        s_url = DB.query("""
            select
                id
                ,url
            from stories
        """).df()

        s_url['tld'] = s_url.url.apply(map_tld)

        DB.query("""
            update stories
            set tld = s_url.tld
            from s_url
            where s_url.id = stories.id
        """)

        DB.query("""
            update stories
            set publisher_id = p.id
            from publishers p
            where p.tld = stories.tld
        """)
    finally:
        DB.close()
|
||||
|
||||
|
||||
def map_tld(x):
    """Return the ``fld`` of URL ``x`` as reported by ``get_tld``, or None.

    Best-effort: any value ``get_tld`` cannot handle (malformed URL, unknown
    domain, a non-string such as NaN) yields ``None`` instead of raising.
    """
    try:
        return get_tld(x, as_object=True).fld
    except Exception:
        # Deliberately broad for messy scraped input, but unlike a bare
        # ``except`` this lets KeyboardInterrupt and SystemExit through.
        return None
|
||||
Reference in New Issue
Block a user