wwu-577/src/data/factcheck.py

# Scrape Media Bias/Fact Check (MBFC) publisher ratings and load them into the
# project's DuckDB database.
import os
import re
import sys
from io import BytesIO
from pathlib import Path
from random import randint
from time import sleep

import click
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm

from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int

@click.command('mbfc:parse-index')
def parse_index():
    """Download the paginated MBFC filtered-search index and save it as a CSV."""
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        html = response.content
        tree = etree.parse(BytesIO(html), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            publisher = {}
            # each row has eight cells; the first holds a link to the detail page
            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(row.iterchildren())
            link = link.xpath('./a')[0]
            publisher['name'] = link.text
            publisher['detail_url'] = link.get('href')
            publisher['bias'] = bias.text
            publisher['reporting'] = reporting.text
            publisher['country'] = country.text
            publisher['credibility'] = credibility.text
            publisher['media_type'] = media_type.text
            publisher['traffic'] = traffic.text
            publisher['popularity'] = popularity.xpath('./span')[0].text
            publishers.append(publisher)
    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
with connect() as db:
db.sql("""create schema mbfc""")
db.sql("""create or replace table mbfc.scrape (
url text
,scraped_at datetime default now()
)
""")
@click.command("mbfc:scrape")
def scrape():
df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
with connect() as db:
stats = db.query("""
select
count(1) filter(where s.url is not null) as elapsed
,count(1) filter(where s.url is null) as remaining
from df
left join mbfc.scrape s
on df.detail_url = s.url
""").fetchall()
df = db.query("""
select
detail_url as url
from df
where df.detail_url not in (
select
url
from mbfc.scrape
)
""").df()
print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
for url in df.url:
delay = randint(1,3)
save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
sleep(delay)
try:
response = requests.get(url)
except Exception as e:
print(f"request failed: {url}", file=sys.stderr)
continue
with open(save_as, 'w') as f:
f.write(response.text)
with connect() as db:
db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
print(f"saved: {save_as}", file=sys.stderr)

def load():
    """Parse the downloaded detail pages, pulling each publisher's own site URL
    out of the 'Source:' line."""
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        # walk up from each "Source:" text node to its enclosing paragraph and
        # take the first link inside it
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            link = e.find('a')
            if link:
                publisher['tld'] = link.get('href')
                break
        else:
            # no source link found anywhere on the page; stop and inspect
            breakpoint()
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
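
# `map_tld` is imported from data.main and not defined in this file. The
# stand-in below is a hypothetical sketch of its assumed behavior (collapse a
# full URL to a bare domain) so the tld joins below are easier to follow; the
# real implementation may differ, and nothing here calls this sketch.
def _map_tld_sketch(url: str) -> str:
    from urllib.parse import urlparse
    host = (urlparse(url).netloc or url).lower()
    return host.removeprefix('www.')  # 'https://www.example.com/about' -> 'example.com'
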

@click.command('mbfc:create-tables')
def create_tables():
    """Join the MBFC ratings with the scraped site URLs, then build the
    mbfc.publishers and mbfc.publisher_stories tables."""
    # mbfc_publishers.csv is produced outside this file; both CSVs are expected
    # to share the mbfc_url merge key
    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    # several MBFC pages can resolve to the same domain, so collapse to one row
    # per tld, keeping the most common value of each rating
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY p.tld
        """)

    # link stories to publishers through their shared top-level domain
    with connect() as db:
        raw_stories = db.sql("""
            SELECT *
            FROM stories s
        """).df()

    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
                s.id as story_id
                ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)

    # backfill an integer encoding of the reporting label
    with connect() as db:
        data = db.sql("""
            select
                id
                ,reporting
            from mbfc.publishers p
        """).df()

    with connect() as db:
        db.sql("""
            alter table mbfc.publishers add column reporting_ordinal int
        """)

    data['ordinal'] = data.reporting.apply(reporting_label_to_int)

    with connect() as db:
        db.sql("""
            update mbfc.publishers
            set reporting_ordinal = data.ordinal
            from data
            where data.id = publishers.id
        """)