import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm


@click.command('mbfc:parse-index')
def parse_index():
    """Download the MBFC filtered-search index pages and save one row per publisher."""
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        html = response.content
        tree = etree.parse(BytesIO(html), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            publisher = {}
            (link, bias, reporting, country, credibility,
             media_type, traffic, popularity) = tuple(row.iterchildren())
            link = link.xpath('./a')[0]
            publisher['name'] = link.text
            publisher['detail_url'] = link.get('href')
            publisher['bias'] = bias.text
            publisher['reporting'] = reporting.text
            publisher['country'] = country.text
            publisher['credibility'] = credibility.text
            publisher['media_type'] = media_type.text
            publisher['traffic'] = traffic.text
            publisher['popularity'] = popularity.xpath('./span')[0].text
            publishers.append(publisher)

    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)


@click.command("mbfc:schema")
def schema():
    """Create the mbfc schema and the table that tracks which detail pages were scraped."""
    with connect() as db:
        db.sql("""create schema if not exists mbfc""")
        db.sql("""
            create or replace table mbfc.scrape (
                 url text
                ,scraped_at datetime default now()
            )
        """)


@click.command("mbfc:scrape")
def scrape():
    """Download each publisher detail page that has not been scraped yet."""
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
    with connect() as db:
        stats = db.query("""
            select count(1) filter(where s.url is not null) as elapsed
                  ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
                on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select detail_url as url
            from df
            where df.detail_url not in (
                select url from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    for url in df.url:
        delay = randint(1, 3)
        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url)
        except Exception:
            print(f"request failed: {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)


def load():
    """Pull each publisher's homepage URL out of the scraped detail pages."""
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        # The detail page lists the outlet's own site in a "Source:" paragraph;
        # walk up from the matching text node to its enclosing <p> and take the first link.
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            link = e.find('a')
            if link:
                publisher['tld'] = link.get('href')
                break
        else:
            print(f"no source link found: {page}", file=sys.stderr)
        publishers.append(publisher)

    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")


@click.command('mbfc:create-tables')
def create_tables():
    """Build mbfc.publishers and mbfc.publisher_stories from the scraped CSVs."""
    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    # Collapse publishers to one row per TLD, keeping the modal label for each field.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT row_number() over() as id
                  ,p.tld
                  ,mode(p.name) as name
                  ,mode(p.bias) as bias
                  ,mode(p.ordinal) as ordinal
                  ,mode(p.reporting) as reporting
                  ,mode(p.country) as country
                  ,mode(p.credibility) as credibility
                  ,mode(p.media_type) as media_type
                  ,mode(p.traffic) as traffic
                  ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT *
            FROM stories s
        """).df()
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    # Link stories to publishers on the shared TLD.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT s.id as story_id
                  ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)

    # Add an integer encoding of the reporting label.
    with connect() as db:
        data = db.sql("""
            select id, reporting from mbfc.publishers p
        """).df()

    with connect() as db:
        db.sql("""
            alter table mbfc.publishers
            add column reporting_ordinal int
        """)

    data['ordinal'] = data.reporting.apply(reporting_label_to_int)

    with connect() as db:
        db.sql("""
            update mbfc.publishers
            set reporting_ordinal = data.ordinal
            from data
            where data.id = publishers.id
        """)