wwu-577/src/data/factcheck.py

import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, reporting_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm


@click.command('mbfc:parse-index')
def parse_index():
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        html = response.content
        tree = etree.parse(BytesIO(html), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            publisher = {}
            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
            link = link.xpath('./a')[0]
            publisher['name'] = link.text
            publisher['detail_url'] = link.get('href')
            publisher['bias'] = bias.text
            publisher['reporting'] = reporting.text
            publisher['country'] = country.text
            publisher['credibility'] = credibility.text
            publisher['media_type'] = media_type.text
            publisher['traffic'] = traffic.text
            publisher['popularity'] = popularity.xpath('./span')[0].text
            publishers.append(publisher)
    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)

@click.command("mbfc:schema")
def schema():
    with connect() as db:
        db.sql("""create schema mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
            )
        """)

@click.command("mbfc:scrape")
def scrape():

    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")

    with connect() as db:
        stats  = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
            on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    for url in df.url:
        delay = randint(1,3)
        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url)
        except Exception as e:
            print(f"request failed: {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)

def load():

    publishers = []
    for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
            else:
                breakpoint()
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")

@click.command('mbfc:create-tables')
def create_tables():

    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    raw_stories['tld'] = raw_stories.url.apply(map_tld)
    
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
               s.id as story_id
               ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
            ON p.tld = s.tld
        """)
    with connect() as db:
        data = db.sql("""
            select
                id,
                reporting
            from mbfc.publishers p
        """).df()

    with connect() as db:
        db.sql("""
            alter table mbfc.publishers add column reporting_ordinal int
        """)

    data['ordinal'] = data.reporting.apply(reporting_label_to_int)

    with connect() as db:
        db.sql("""
            update mbfc.publishers
            set reporting_ordinal = data.ordinal
            from data
            where data.id = publishers.id
        """)
add mbfc data. use context manager for db. add paths fn. 2023-06-01 09:44:01 -07:00			`import requests`
			`from lxml import etree`
			`from bs4 import BeautifulSoup`
			`import re`
			`from io import BytesIO`
			`import pandas as pd`
			`from pathlib import Path`
			`import os`
			`import sys`
			`import click`
rough draft of paper done. 2023-06-07 20:44:48 -07:00			`from data.main import connect, map_tld, paths, reporting_label_to_int`
add mbfc data. use context manager for db. add paths fn. 2023-06-01 09:44:01 -07:00			`from random import randint`
			`from time import sleep`
			`from tqdm import tqdm`


			`@click.command('mbfc:parse-index')`
			`def parse_index():`
			`parser = etree.HTMLParser()`
			`publishers = []`
			`for page in range(1, 54):`
			`url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"`
			`print(f"downloading {url}", file=sys.stderr)`
			`response = requests.get(url)`
			`html = response.content`
			`tree = etree.parse(BytesIO(html), parser)`
			`rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')`
			`print(f"parsing {len(rows)} rows", file=sys.stderr)`
			`for row in rows:`
			`publisher = {}`
			`link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())`
			`link = link.xpath('./a')[0]`
			`publisher['name'] = link.text`
			`publisher['detail_url'] = link.get('href')`
			`publisher['bias'] = bias.text`
			`publisher['reporting'] = reporting.text`
			`publisher['country'] = country.text`
			`publisher['credibility'] = credibility.text`
			`publisher['media_type'] = media_type.text`
			`publisher['traffic'] = traffic.text`
			`publisher['popularity'] = popularity.xpath('./span')[0].text`
			`publishers.append(publisher)`
			`df = pd.DataFrame(publishers)`
			`save_to = paths('data') / 'mbfc_bias.csv'`
			`df.to_csv(save_to, sep='\|', index=False)`
			`print(f"saved {len(df)}: {save_to}", file=sys.stderr)`

			`@click.command("mbfc:schema")`
			`def schema():`
			`with connect() as db:`
			`db.sql("""create schema mbfc""")`
			`db.sql("""create or replace table mbfc.scrape (`
			`url text`
			`,scraped_at datetime default now()`
			`)`
			`""")`

			`@click.command("mbfc:scrape")`
			`def scrape():`

			`df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="\|")`

			`with connect() as db:`
			`stats = db.query("""`
			`select`
			`count(1) filter(where s.url is not null) as elapsed`
			`,count(1) filter(where s.url is null) as remaining`
			`from df`
			`left join mbfc.scrape s`
			`on df.detail_url = s.url`
			`""").fetchall()`
			`df = db.query("""`
			`select`
			`detail_url as url`
			`from df`
			`where df.detail_url not in (`
			`select`
			`url`
			`from mbfc.scrape`
			`)`
			`""").df()`
			`print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")`

			`for url in df.url:`
			`delay = randint(1,3)`
			`save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')`
			`print(f"downloading (delay: {delay}): {url}", file=sys.stderr)`
			`sleep(delay)`
			`try:`
			`response = requests.get(url)`
			`except Exception as e:`
			`print(f"request failed: {url}", file=sys.stderr)`
			`continue`
			`with open(save_as, 'w') as f:`
			`f.write(response.text)`
			`with connect() as db:`
			`db.execute("""insert into mbfc.scrape (url) values (?)""", [url])`
			`print(f"saved: {save_as}", file=sys.stderr)`

			`def load():`

			`publishers = []`
			`for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):`
			`publisher = {}`
			`publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"`
			`with page.open() as p:`
			`tree = BeautifulSoup(p, 'html.parser')`
			`for e in tree(string=re.compile(r'source:', re.IGNORECASE)):`
			`e = e.parent`
			`while e.name != 'p':`
			`e = e.parent`
			`l = e.find('a')`
			`if l:`
			`publisher['tld'] = l.get('href')`
			`break`
			`else:`
			`breakpoint()`
			`publishers.append(publisher)`
			`df = pd.DataFrame(publishers)`
			`df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="\|")`

			`@click.command('mbfc:create-tables')`
			`def create_tables():`

			`pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='\|')`
			`urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="\|")`
			`df = pubs.merge(urls, on='mbfc_url')`
			`df['tld'] = df.tld.apply(map_tld)`
			`df['ordinal'] = df.bias.apply(bias_label_to_int)`

			`with connect() as db:`
			`db.sql("""`
			`CREATE OR REPLACE TABLE mbfc.publishers AS`
			`SELECT`
			`row_number() over() as id`
			`,p.tld`
			`,mode(p.name) as name`
			`,mode(p.bias) as bias`
			`,mode(p.ordinal) as ordinal`
			`,mode(p.reporting) as reporting`
			`,mode(p.country) as country`
			`,mode(p.credibility) as credibility`
			`,mode(p.media_type) as media_type`
			`,mode(p.traffic) as traffic`
			`,mode(p.popularity) as popularity`
			`FROM df p`
			`GROUP BY`
			`p.tld`
			`""")`

			`with connect() as db:`
			`raw_stories = db.sql("""`
			`SELECT`
			`*`
			`FROM stories s`
			`""").df()`

rough draft of paper done. 2023-06-07 20:44:48 -07:00			`raw_stories['tld'] = raw_stories.url.apply(map_tld)`
add mbfc data. use context manager for db. add paths fn. 2023-06-01 09:44:01 -07:00
			`with connect() as db:`
			`db.sql("""`
			`CREATE OR REPLACE TABLE mbfc.publisher_stories AS`
			`SELECT`
			`s.id as story_id`
			`,p.id as publisher_id`
			`FROM raw_stories s`
			`JOIN mbfc.publishers p`
			`ON p.tld = s.tld`
			`""")`
rough draft of paper done. 2023-06-07 20:44:48 -07:00			`with connect() as db:`
			`data = db.sql("""`
			`select`
			`id,`
			`reporting`
			`from mbfc.publishers p`
			`""").df()`

			`with connect() as db:`
			`db.sql("""`
			`alter table mbfc.publishers add column reporting_ordinal int`
			`""")`
add mbfc data. use context manager for db. add paths fn. 2023-06-01 09:44:01 -07:00
rough draft of paper done. 2023-06-07 20:44:48 -07:00			`data['ordinal'] = data.reporting.apply(reporting_label_to_int)`
add mbfc data. use context manager for db. add paths fn. 2023-06-01 09:44:01 -07:00
rough draft of paper done. 2023-06-07 20:44:48 -07:00			`with connect() as db:`
			`db.sql("""`
			`update mbfc.publishers`
			`set reporting_ordinal = data.ordinal`
			`from data`
			`where data.id = publishers.id`
			`""")`