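"""Scrape Media Bias/Fact Check (MBFC).

Commands: download the publisher index (mbfc:parse-index), create the scrape
bookkeeping table (mbfc:schema), fetch each publisher's detail page
(mbfc:scrape), and assemble the publisher tables (mbfc:create-tables).
"""
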
import re
import sys
from io import BytesIO
from random import randint
from time import sleep

import click
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm

from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int


@click.command('mbfc:parse-index')
def parse_index():
    """Download the paginated MBFC index into data/mbfc_bias.csv."""
    parser = etree.HTMLParser()
    publishers = []
    # The filtered-search index was 53 pages when this was written.
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        tree = etree.parse(BytesIO(response.content), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            # Each row has eight cells: a link to the detail page, then the ratings.
            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(row.iterchildren())
            link = link.xpath('./a')[0]
            publisher = {
                'name': link.text,
                'detail_url': link.get('href'),
                'bias': bias.text,
                'reporting': reporting.text,
                'country': country.text,
                'credibility': credibility.text,
                'media_type': media_type.text,
                'traffic': traffic.text,
                'popularity': popularity.xpath('./span')[0].text,
            }
            publishers.append(publisher)

    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)


@click.command("mbfc:schema")
|
|
|
|
def schema():
|
|
|
|
with connect() as db:
|
|
|
|
db.sql("""create schema mbfc""")
|
|
|
|
db.sql("""create or replace table mbfc.scrape (
|
|
|
|
url text
|
|
|
|
,scraped_at datetime default now()
|
|
|
|
)
|
|
|
|
""")


@click.command("mbfc:scrape")
|
|
|
|
def scrape():
|
|
|
|
|
|
|
|
df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
|
|
|
|
|
|
|
|
with connect() as db:
|
|
|
|
stats = db.query("""
|
|
|
|
select
|
|
|
|
count(1) filter(where s.url is not null) as elapsed
|
|
|
|
,count(1) filter(where s.url is null) as remaining
|
|
|
|
from df
|
|
|
|
left join mbfc.scrape s
|
|
|
|
on df.detail_url = s.url
|
|
|
|
""").fetchall()
|
|
|
|
df = db.query("""
|
|
|
|
select
|
|
|
|
detail_url as url
|
|
|
|
from df
|
|
|
|
where df.detail_url not in (
|
|
|
|
select
|
|
|
|
url
|
|
|
|
from mbfc.scrape
|
|
|
|
)
|
|
|
|
""").df()
|
|
|
|
print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
|
|
|
|
|
|
|
|
for url in df.url:
|
|
|
|
delay = randint(1,3)
|
|
|
|
save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
|
|
|
|
print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
|
|
|
|
sleep(delay)
|
|
|
|
try:
|
|
|
|
response = requests.get(url)
|
|
|
|
except Exception as e:
|
|
|
|
print(f"request failed: {url}", file=sys.stderr)
|
|
|
|
continue
|
|
|
|
with open(save_as, 'w') as f:
|
|
|
|
f.write(response.text)
|
|
|
|
with connect() as db:
|
|
|
|
db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
|
|
|
|
print(f"saved: {save_as}", file=sys.stderr)


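# Recover each publisher's homepage from the saved detail pages: MBFC detail
# pages carry a "Source: <link>" line whose anchor points at the outlet's site.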
def load():
    """Parse saved detail pages into data/mbfc_publisher_url.csv.

    Note: unlike its siblings, this is not registered as a click command.
    """
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        # Find the "Source:" text, walk up to its enclosing <p>, and take the
        # first link in that paragraph as the publisher's site.
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
        else:
            # No source link found on this page; drop into the debugger to inspect.
            breakpoint()
        publishers.append(publisher)

    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")


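# Final assembly: one publisher row per TLD (majority vote across duplicate
# labels), plus a story/publisher join table keyed on TLD.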
@click.command('mbfc:create-tables')
def create_tables():
    """Build mbfc.publishers and mbfc.publisher_stories from the scraped CSVs."""
    # Both CSVs are expected to share an 'mbfc_url' column to merge on.
    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    # One row per TLD; mode() takes the majority label when several MBFC
    # entries map to the same domain.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    # Link stories to publishers by TLD.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
                s.id as story_id
                ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)

    # Backfill an integer encoding of the reporting label.
    with connect() as db:
        data = db.sql("""
            select
                id
                ,reporting
            from mbfc.publishers
        """).df()

    with connect() as db:
        db.sql("""
            alter table mbfc.publishers add column reporting_ordinal int
        """)

    data['ordinal'] = data.reporting.apply(reporting_label_to_int)

    with connect() as db:
        db.sql("""
            update mbfc.publishers
            set reporting_ordinal = data.ordinal
            from data
            where data.id = publishers.id
        """)
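

# How these commands might be wired into the project's CLI -- a sketch only,
# assuming data.main exposes a click group named `cli` (the actual
# registration lives elsewhere in the repo):
#
#   from data.main import cli
#   for cmd in (parse_index, schema, scrape, create_tables):
#       cli.add_command(cmd)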