add progress report.

add scraper for downloading and parsing.
add joining of bias dataset.
add broken links checker.
matt
2023-04-11 21:42:05 -07:00
parent b9c63414a0
commit feb3a4b8ed
9 changed files with 276 additions and 0 deletions

src/broken_links.py Normal file

@@ -0,0 +1,23 @@
import click
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt

DB = connect()

# ratio of stories whose (url, outlet) pair is unique to those that repeat
print(DB.sql("""
with cte as (
    select
        count(1) as cnt
    from stories
    group by url, outlet
)
select
    cast(sum(cnt) filter (where cnt = 1) as float)
    / sum(cnt) filter (where cnt > 1) as dups
from cte
"""))

# distribution of repeat counts per (url, outlet) pair
hist = DB.sql("""
select
    count(1) as cnt
from stories
group by url, outlet
""").df()
sns.histplot(x=hist['cnt'])
plt.show()
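For reference, a minimal sketch of DuckDB's FILTER aggregate modifier used in the ratio query above (toy values, not project data):

import duckdb

# singles sums only rows where cnt = 1; multis sums the rest
duckdb.sql("""
with t as (select unnest([1, 1, 2, 3]) as cnt)
select
    sum(cnt) filter (where cnt = 1) as singles,
    sum(cnt) filter (where cnt > 1) as multis
from t
""").show()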

src/data.py

@@ -6,6 +6,15 @@ from enum import Enum
class Data(str, Enum):
    Titles = 'titles'

def data_dir():
    return Path(os.environ['DATA_MINING_DATA_DIR'])

def connect():
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
    return DB

def from_db(t: Data):
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
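A minimal usage sketch of these helpers, assuming DATA_MINING_DATA_DIR is set and project.duckdb already holds a stories table:

from data import connect, data_dir

DB = connect()
print(data_dir())                              # where raw files live
print(DB.sql("select count(1) from stories"))  # sanity check the database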

src/join_bias.py Normal file

@@ -0,0 +1,46 @@
import os
from pathlib import Path

import polars as ps
from data import connect

DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|")

# fuzzy-join scraped outlets against the AllSides bias table
joined = DB.sql("""
with cte as (
    select
        outlet
        ,count(1) as stories
    from stories
    group by outlet
)
,total as (
    select
        sum(stories) as total
    from cte
)
select
    cte.outlet
    ,cte.stories
    ,bias.outlet
    ,bias.lean
    ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
    ,total.total
from cte
join bias
    on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
cross join total
""")
print(joined)

# top outlets by story count, to eyeball what the fuzzy join should hit
outlets = DB.sql("""
select
    outlet
    ,count(1) as stories
from stories
group by outlet
order by count(1) desc
limit 50
""")
print(outlets)
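For intuition on the 0.9 cutoff in the join condition, a small sketch of DuckDB's jaro_winkler_similarity, which returns a score in [0, 1] (outlet names here are illustrative):

import duckdb

# near-identical spellings score close to 1; unrelated names fall well below 0.9
duckdb.sql("""
select
    jaro_winkler_similarity('Washington Post', 'The Washington Post') as close,
    jaro_winkler_similarity('Washington Post', 'Politico') as far
""").show()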

src/scrape.py Normal file

@@ -0,0 +1,92 @@
from datetime import date, timedelta
from pathlib import Path

import click
import pandas as pd
import requests
from lxml import etree
from tqdm import tqdm

from data import data_dir

@click.group()
def cli():
    ...

@cli.command()
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
def download(output_dir):
    output_dir.mkdir(parents=True, exist_ok=True)
    # one archive page per day, from the start of the memeorandum archive to today
    day = timedelta(days=1)
    cur = date(2005, 10, 1)
    end = date.today()
    dates = []
    while cur <= end:
        dates.append(cur)
        cur = cur + day
    date_iter = tqdm(dates)
    for i in date_iter:
        date_iter.set_postfix_str(f"{i}")
        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
        if save_as.exists():
            continue
        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
        r = requests.get(url)
        with open(save_as, 'w') as f:
            f.write(r.text)

@cli.command()
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
@click.option('-o', '--output_dir', type=Path, default=data_dir())
def parse(directory, output_dir):
    parser = etree.HTMLParser()
    pages = sorted(directory.glob("*.html"))
    published = []
    others = []
    page_iter = tqdm(pages)
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        items = root.xpath("//div[contains(@class, 'item')]")
        for item in items:
            out = dict()
            citation = item.xpath('./cite')
            if not citation:
                continue
            author = citation[0]
            if author.text:
                # cite text looks like "Author / Publisher"; keep the author part
                author = ''.join(author.text.split('/')[:-1]).strip()
            else:
                author = ''
            out['author'] = author
            url = citation[0].getchildren()[0].get('href')
            publisher = citation[0].getchildren()[0].text
            out['publisher'] = publisher
            out['publisher_url'] = url
            title = item.xpath('.//strong/a')[0].text
            out['title'] = title
            # note: Python's hash() is salted per process, so these ids are
            # only stable within a single run
            item_id = hash((title, page.stem, url))
            out['id'] = item_id
            published.append(out)
            related = item.xpath(".//span[contains(@class, 'mls')]/a")
            for relation in related:
                another = dict()
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
                others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)

if __name__ == "__main__":
    cli()
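A quick smoke test of the CLI wiring via click's test runner; a sketch only, it checks that the download and parse commands register without touching the network. It assumes DATA_MINING_DATA_DIR is set, since the option defaults call data_dir() when scrape.py is imported:

from click.testing import CliRunner
from scrape import cli

runner = CliRunner()
result = runner.invoke(cli, ['--help'])  # should list download and parse
print(result.output)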