Add progress report.
Add scraping for downloading and parsing; add joining of the bias dataset; add a broken links checker.
src/broken_links.py (Normal file, 23 lines added)
@@ -0,0 +1,23 @@
# broken_links.py: first pass looks at duplicate (url, outlet) rows;
# requests/click are imported for the link checker still to come.
import requests
import click
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt

DB = connect()

# share of (url, outlet) groups that appear once vs. duplicated
DB.sql("""
    with cte as (
        select
            count(1) as cnt
        from stories
        group by url, outlet
    )
    select
        cast(sum(cnt) filter (where cnt = 1) as float)
        / sum(cnt) filter (where cnt > 1) as dups
    from cte
""").show()

# per-group counts for the histogram; the original plotted an undefined `hist`
hist = DB.sql("""
    select count(1) as cnt
    from stories
    group by url, outlet
""").df()

sns.histplot(x=hist['cnt'])
plt.show()
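The file imports requests and click but the diff above stops at the duplicate analysis, so the link check promised in the commit message is still missing. A minimal sketch of what it might look like, reusing the DB handle above (the HEAD-request approach, timeout, and status-code cutoff are assumptions, not part of the commit):

def check_links(limit=100):
    # hypothetical checker: HEAD each distinct story url and flag failures
    urls = DB.sql(f"select distinct url from stories limit {limit}").df()['url']
    broken = []
    for url in urls:
        try:
            r = requests.head(url, timeout=10, allow_redirects=True)
            if r.status_code >= 400:
                broken.append((url, r.status_code))
        except requests.RequestException:
            broken.append((url, None))
    return broken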
src/data.py
@@ -6,6 +6,15 @@ from enum import Enum
class Data(str, Enum):
    Titles = 'titles'

def data_dir():
    return Path(os.environ['DATA_MINING_DATA_DIR'])

def connect():
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
    return DB

def from_db(t: Data):
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
src/join_bias.py (Normal file, 46 lines added)
@@ -0,0 +1,46 @@
import os
from pathlib import Path

import click
import duckdb
from data import connect
import polars as ps

DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# recent polars spells the delimiter argument `separator`
bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', separator="|")

# fuzzy-join per-outlet story counts against the AllSides bias table
DB.sql("""
    with cte as (
        select
            outlet
            ,count(1) as stories
        from stories
        group by outlet
    )
    ,total as (
        select
            sum(stories) as total
        from cte
    )
    select
        cte.outlet
        ,cte.stories
        ,bias.outlet
        ,bias.lean
        ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
        ,total.total
    from cte
    join bias
        on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
    cross join total
""").show()

# top outlets by story count, for eyeballing the join coverage
outlets = DB.sql("""
    select
        outlet
        ,count(1) as stories
    from stories
    group by outlet
    order by count(1) desc
    limit 50
""").df()
print(outlets)
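DuckDB resolves the bare bias identifier in the SQL above through its replacement scan over DataFrames in scope, so no explicit registration is needed. If the joined result should survive the session, it can be materialized instead; a sketch under that assumption (the outlet_bias table name is mine, not from the commit):

# hypothetical: persist the fuzzy outlet/bias join for downstream scripts
DB.sql("""
    create or replace table outlet_bias as
    select
        s.outlet
        ,count(1) as stories
        ,any_value(bias.lean) as lean
    from stories s
    join bias
        on jaro_winkler_similarity(bias.outlet, s.outlet) > 0.9
    group by s.outlet
""")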
src/scrape.py (Normal file, 92 lines added)
@@ -0,0 +1,92 @@
from datetime import date, timedelta
import datetime
import requests
from pathlib import Path
import click
from tqdm import tqdm
from data import data_dir
from lxml import etree
import pandas as pd


@click.group()
def cli():
    ...


@cli.command()
@click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
def download(output_dir):
    """Fetch one memeorandum archive page per day, skipping days already on disk."""
    output_dir.mkdir(parents=True, exist_ok=True)
    day = timedelta(days=1)
    cur = date(2005, 10, 1)
    end = date.today()
    dates = []
    while cur <= end:
        dates.append(cur)
        cur = cur + day
    date_iter = tqdm(dates, postfix="test")
    for i in date_iter:
        date_iter.set_postfix_str(f"{i}")
        save_as = output_dir / f"{i.strftime('%y-%m-%d')}.html"
        if save_as.exists():
            continue
        url = f"https://www.memeorandum.com/{i.strftime('%y%m%d')}/h2000"
        r = requests.get(url)
        with open(save_as, 'w') as f:
            f.write(r.text)


@cli.command()
@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum")
@click.option('-o', '--output_dir', type=Path, default=data_dir())
def parse(directory, output_dir):
    """Parse the downloaded pages into stories.csv and related.csv."""
    parser = etree.HTMLParser()
    pages = [f for f in directory.glob("*.html")]
    published = []
    others = []
    page_iter = tqdm(pages, postfix="starting")
    for page in page_iter:
        page_iter.set_postfix_str(f"{page}")
        # page date is recoverable from the filename; not attached to rows yet
        page_date = datetime.datetime.strptime(page.stem, '%y-%m-%d')
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        items = root.xpath("//div[contains(@class, 'item')]")

        for item in items:
            out = dict()
            citation = item.xpath('./cite')
            if not citation:
                continue
            author = citation[0]
            if author.text:
                author = ''.join(author.text.split('/')[:-1]).strip()
            else:
                author = ''
            out['author'] = author
            url = citation[0].getchildren()[0].get('href')
            publisher = citation[0].getchildren()[0].text
            out['publisher'] = publisher
            out['publisher_url'] = url
            title = item.xpath('.//strong/a')[0].text
            out['title'] = title
            # note: hash() is salted per process, so these ids differ between runs
            item_id = hash((title, page.stem, url))
            out['id'] = item_id
            published.append(out)

            related = item.xpath(".//span[contains(@class, 'mls')]/a")
            for relation in related:
                another = dict()
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
                others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
    df = pd.DataFrame(others)
    df.to_csv(output_dir / 'related.csv', sep='|', index=False)


if __name__ == "__main__":
    cli()
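One caveat on the id column written by parse: Python's built-in hash() is salted per process for strings, so ids in stories.csv and parent_id in related.csv will not be reproducible across runs. If stable ids matter later, a digest-based helper is one option; the stable_id function below is a sketch, not part of the commit:

import hashlib

def stable_id(title, stem, url):
    # deterministic across runs, unlike hash() on tuples of str
    key = '|'.join([title or '', stem or '', url or ''])
    return hashlib.sha1(key.encode('utf-8')).hexdigest()[:16]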