from datetime import date, timedelta
import datetime
from pathlib import Path
from urllib.parse import urlparse

import click
import pandas as pd
import requests
from lxml import etree
from tld import get_tld
from tld.utils import update_tld_names
from tqdm import tqdm

from data.main import data_dir, connect


@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@click.option('--database', type=Path, default=data_dir() / "stories.duckdb", show_default=True)
def load(directory, database):
    """load stories.csv and related.csv into duckdb tables."""
    stories = directory / "stories.csv"
    related = directory / "related.csv"

    db = connect()
    db.sql(f"""
        CREATE OR REPLACE TABLE stories AS
        SELECT *
        FROM read_csv_auto('{stories}')
    """)
    db.sql(f"""
        CREATE OR REPLACE TABLE related_stories AS
        SELECT *
        FROM read_csv_auto('{related}')
    """)
    db.close()


@click.command(name='scrape:download')
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum", show_default=True)
def download(output_dir):
    """download every day from 2005-10-01 to today from memeorandum.com"""
    output_dir.mkdir(parents=True, exist_ok=True)

    # collect only the days that have not been downloaded yet
    day = timedelta(days=1)
    cur = date(2005, 10, 1)
    end = date.today()
    dates = []
    while cur <= end:
        if not (output_dir / f"{cur.strftime('%y-%m-%d')}.html").exists():
            dates.append(cur)
        cur = cur + day

    date_iter = tqdm(dates)
    for i in date_iter:
        date_iter.set_postfix_str(f"{i}")
        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
        if save_as.exists():
            continue
        # memeorandum serves a daily archive page at /YYMMDD/h2000
        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
        r = requests.get(url)
        with open(save_as, 'w') as f:
            f.write(r.text)


@click.command(name='scrape:parse')
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True)
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
    """parse the html files on disk into a structured csv format."""
    update_tld_names()
    parser = etree.HTMLParser()
    pages = [f for f in directory.glob("*.html")]

    published = []
    others = []

    page_iter = tqdm(pages, postfix="starting")
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        date = datetime.datetime.strptime(page.stem, '%y-%m-%d')

        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        if root is None:
            print(f"error opening {page}")
            continue

        items = root.xpath("//div[contains(@class, 'item')]")
        for item in items:
            out = dict()
            out['published_at'] = date

            citation = item.xpath('./cite')
            if not citation:
                continue
            author = citation[0]
            if author.text:
                author = ''.join(author.text.split('/')[:-1]).strip()
            else:
                author = ''
            out['author'] = author

            try:
                publisher_url = citation[0].getchildren()[0].get('href')
                publisher = citation[0].getchildren()[0].text
            except IndexError:
                # the citation has no publisher link; skip the item instead of
                # falling through with publisher/publisher_url unset
                print(f"error with citation url: {page}")
                continue
            out['publisher'] = publisher
            out['publisher_url'] = publisher_url

            title = item.xpath('.//strong/a')[0].text
            out['title'] = title
            url = item.xpath('.//strong/a')[0].get('href')
            out['url'] = url
            out['tld'] = get_tld(publisher_url)

            item_id = hash((page.stem, url))
            out['id'] = item_id
            # old_id = hash((title, page.stem, publisher_url))
            # out['old_id'] = old_id
            published.append(out)

            related = item.xpath(".//span[contains(@class, 'mls')]/a")
            for relation in related:
                another = dict()
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
                another['publisher_domain'] = urlparse(another['url']).netloc
                others.append(another)

    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)

    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)


@click.command(name='scrape:normalize')
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    DB = connect()

    # drop stories whose url or title duplicates another row
    DB.sql("""
        DELETE FROM stories
        WHERE id IN (
            WITH cte AS (
                SELECT
                    url
                    ,id
                    ,ROW_NUMBER() OVER(PARTITION BY url) AS url_ctn
                    ,ROW_NUMBER() OVER(PARTITION BY title) AS title_ctn
                FROM stories
            )
            SELECT id
            FROM cte
            WHERE url_ctn > 1
            OR title_ctn > 1
        )
    """)

    # one publisher per (name, url) seen in either stories or related_stories
    DB.sql("""
        CREATE OR REPLACE TABLE publishers AS
        with cte as (
            SELECT
                s.publisher as name
                ,s.publisher_url_domain as url
            FROM stories s
            GROUP BY
                s.publisher
                ,s.publisher_url_domain
        ), together AS (
            SELECT
                COALESCE(cte.name, r.publisher) AS name
                ,COALESCE(cte.url, r.publisher_domain) as url
            FROM cte
            FULL OUTER JOIN related_stories r
                ON cte.url = r.publisher_domain
        )
        SELECT
            ROW_NUMBER() OVER() as id
            ,t.name
            ,t.url
        FROM together t
        where t.url is not null
        GROUP BY
            name
            ,url
    """)

    # link stories to publishers, then drop the now-redundant columns
    DB.sql("""
        alter table stories
        add column publisher_id bigint
    """)
    DB.sql("""
        update stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = stories.publisher_url_domain
    """)
    DB.sql("""
        alter table stories
        alter publisher_id set data type bigint
    """)
    DB.sql("""
        alter table stories drop publisher;
        alter table stories drop publisher_url;
        alter table stories drop publisher_url_domain;
        alter table stories drop domain;
    """)

    # same for related_stories
    DB.sql("""
        alter table related_stories
        add column publisher_id bigint
    """)
    DB.sql("""
        update related_stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = related_stories.publisher_domain
    """)
    DB.sql("""
        alter table related_stories drop publisher;
        alter table related_stories drop publisher_domain;
    """)


def another_norm():
    """ad-hoc fixup: rebuild publishers keyed on tld and relink stories."""
    DB = connect()

    sv2 = pd.read_csv(data_dir() / 'stories.csv', sep="|")
    related = pd.read_csv(data_dir() / 'related.csv', sep="|")
    related['tld'] = related.url.apply(lambda x: map_tld(x))

    DB.query("""
        update related_stories
        set publisher_id = p.id
        from publishers p
        join related r
            on r.tld = p.tld
        where r.url = related_stories.url
    """)

    DB.query("""alter table stories add column tld text""")
    s_url = DB.query("""
        select
            id
            ,url
        from stories
    """).df()
    s_url['tld'] = s_url.url.apply(lambda x: map_tld(x))
    DB.query("""
        update stories
        set tld = s_url.tld
        from s_url
        where s_url.id = stories.id
    """)
    DB.query("""
        update stories
        set publisher_id = p.id
        from publishers p
        where p.tld = stories.tld
    """)

    # rebuild publishers: keep, per tld, the publisher name used most often in 2022
    sv2['tld'] = sv2.publisher_url.apply(lambda x: map_tld(x))
    new_pub = DB.query("""
        with cte as (
            select
                tld
                ,publisher
                ,count(1) filter(where year(published_at) = 2022) as recent_ctn
                ,count(1) as ctn
            from sv2
            group by
                tld
                ,publisher
        )
        ,r as (
            select
                tld
                ,publisher
                ,ctn
                ,row_number() over(partition by tld order by recent_ctn desc) as rn
            from cte
        )
        select
            row_number() over() as id
            ,publisher as name
            ,tld
        from r
        where rn = 1
        order by ctn desc
    """).df()
    DB.query("""
        CREATE OR REPLACE TABLE publishers AS
        SELECT
            id
            ,name
            ,tld
        FROM new_pub
    """)

    # relink stories to the rebuilt publisher ids via tld
    DB.query("""
        update stories
        set publisher_id = p.id
        from new_pub p
        where p.tld = stories.tld
    """)

    # spot check
    DB.sql("""
        SELECT
            s.id
            ,sv2.publisher_url
        FROM stories s
        JOIN sv2
            on sv2.id = s.id
        limit 5
    """)


@click.command('data:create-election-table')
def create_elections_table():
    """load election_dates.csv and compute each publish date's nearest election."""
    df = pd.read_csv(data_dir() / 'election_dates.csv', sep="|")
    df['date'] = pd.to_datetime(df.date)

    DB = connect()
    DB.query("""
        CREATE OR REPLACE TABLE election_dates AS
        SELECT
            row_number() over() as id
            ,type
            ,date
            ,winner
        FROM df
    """)

    DB.query("""
        CREATE OR REPLACE TABLE election_distance AS
        WITH cte as (
            SELECT
                day(e.date - s.published_at) as days_away
                ,e.id as election_id
                ,e.date as election_date
                ,s.published_at as publish_date
                ,e.winner as winner
            FROM (
                SELECT DISTINCT published_at
                FROM top.stories
            ) s
            CROSS JOIN election_dates e
        )
        , windowed as (
            SELECT
                row_number() over(partition by publish_date order by abs(days_away) asc) as rn
                ,days_away
                ,publish_date
                ,election_date
                ,election_id
                ,winner
            FROM cte
        )
        SELECT
            days_away
            ,publish_date
            ,election_date
            ,election_id
            ,winner
        FROM windowed
        WHERE rn = 1
    """)
    DB.close()


@click.command('scrape:create-denorm')
def create_denorm():
    """build a denormalized story table joining publishers, sentiment, bias, and emotion."""
    DB = connect()
    DB.sql("create schema if not exists denorm")

    DB.sql("""
        CREATE OR REPLACE TABLE denorm.stories AS
        SELECT
            s.id as story_id
            ,s.title
            ,s.url
            ,s.published_at
            ,s.author
            ,p.name as publisher
            ,p.tld as tld
            ,sent.class_id as sentiment
            ,d.days_away as election_distance
            ,b.ordinal as bias
            ,pca.first as link_1
            ,pca.second as link_2
            ,e.emotion_id as emotion
        FROM top.stories s
        JOIN top.publishers p
            ON p.id = s.publisher_id
        JOIN top.story_sentiments sent
            ON s.id = sent.story_id
        -- join on the story's publish date to pick up its distance to the nearest election
        JOIN election_distance d
            ON d.publish_date = s.published_at
        JOIN publisher_bias pb
            ON pb.publisher_id = p.id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        JOIN top.publisher_pca_onehot pca
            ON pca.publisher_id = p.id
        JOIN story_emotions e
            ON e.story_id = s.id
    """)
    DB.close()
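

# A minimal sketch of a standalone entry point for the commands above, assuming
# they are not already registered in a shared cli module elsewhere in the repo.
# The group name `cli` is illustrative only.
if __name__ == '__main__':
    @click.group()
    def cli():
        """memeorandum pipeline: download -> parse -> load -> normalize."""

    for cmd in (download, parse, load, normalize, create_elections_table, create_denorm):
        cli.add_command(cmd)

    cli()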