add mbfc data. use context manager for db. add paths fn.

This commit is contained in:
matt
2023-06-01 09:44:01 -07:00
parent 398228f02c
commit 79808f01d6
40 changed files with 1354 additions and 1137 deletions

View File

@@ -1,6 +1,10 @@
import data.main
import data.scrape
import data.factcheck
import data.links
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
]

171
src/data/factcheck.py Normal file
View File

@@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from random import randint
from time import sleep
from tqdm import tqdm
@click.command('mbfc:parse-index')
def parse_index():
    """Scrape the MBFC filtered-search index into ``mbfc_bias.csv``.

    Walks the paginated index, reads one publisher per table row and writes
    the collected rows as a pipe-separated CSV in the data directory.
    Progress and failures are reported on stderr.
    """
    parser = etree.HTMLParser()
    publishers = []
    # NOTE(review): the page count is hard-coded; presumably the number of
    # index pages when this was written -- confirm before re-running.
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # An error page carries no publisher table; report it instead of
            # silently parsing zero rows.
            print(f"request failed ({response.status_code}): {url}", file=sys.stderr)
            continue
        tree = etree.parse(BytesIO(response.content), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            # Every row must have exactly these eight cells, in this order;
            # anything else raises ValueError on unpacking.
            (link, bias, reporting, country, credibility,
             media_type, traffic, popularity) = row.iterchildren()
            link = link.xpath('./a')[0]
            publishers.append({
                'name': link.text,
                'detail_url': link.get('href'),
                'bias': bias.text,
                'reporting': reporting.text,
                'country': country.text,
                'credibility': credibility.text,
                'media_type': media_type.text,
                'traffic': traffic.text,
                'popularity': popularity.xpath('./span')[0].text,
            })
    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
    """Create the ``mbfc`` schema and (re)create the ``mbfc.scrape`` table.

    ``mbfc.scrape`` holds one row per downloaded URL with its scrape time.
    """
    with connect() as db:
        # IF NOT EXISTS keeps the command re-runnable, matching the
        # CREATE OR REPLACE used for the table below.
        db.sql("""create schema if not exists mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
            )
        """)
@click.command("mbfc:scrape")
def scrape():
    """Download every MBFC detail page that has not been scraped yet.

    Reads the publisher index from ``mbfc_bias.csv``, skips URLs already
    recorded in ``mbfc.scrape``, saves each remaining page as HTML under
    ``<data>/mbfc/`` and records the URL once the file is written.
    """
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
    with connect() as db:
        # Both queries read the local DataFrame `df` by name from SQL.
        stats = db.query("""
            select
            count(1) filter(where s.url is not null) as elapsed
            ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
            on df.detail_url = s.url
        """).fetchall()
        pending = db.query("""
            select
            detail_url as url
            from df
            where df.detail_url not in (
            select
            url
            from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    # Make sure the output directory exists before the first write.
    target_dir = paths('data') / 'mbfc'
    target_dir.mkdir(parents=True, exist_ok=True)

    for url in pending.url:
        # Random pause between requests to avoid hammering the site.
        delay = randint(1, 3)
        # File name is the last path segment of the URL.
        save_as = target_dir / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print(f"request failed: {url} ({e})", file=sys.stderr)
            continue
        if not response.ok:
            # Do not store an error page or mark the URL as scraped, so it
            # is retried on the next run.
            print(f"request failed ({response.status_code}): {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        # A short-lived connection per URL: progress is recorded as soon as
        # each page is saved.
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)
def load():
    """Extract each publisher's own site link from the scraped MBFC pages.

    For every file under ``<data>/mbfc/`` this looks for text matching
    "source:" (case-insensitive), climbs to the enclosing ``<p>`` and takes
    the ``href`` of the first link in it. Results are written to
    ``mbfc_publisher_url.csv`` (pipe-separated) with columns ``origin_url``
    and, where a link was found, ``tld``.
    """
    source_label = re.compile(r'source:', re.IGNORECASE)
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {'origin_url': f"https://mediabiasfactcheck.com/{page.stem}"}
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        for e in tree(string=source_label):
            e = e.parent
            # Climb to the enclosing paragraph; stop at the document root
            # instead of dereferencing a missing parent.
            while e is not None and e.name != 'p':
                e = e.parent
            if e is None:
                continue
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
        else:
            # No usable link on this page: report it and keep going rather
            # than dropping into the debugger mid-batch.
            print(f"no source link found: {page}", file=sys.stderr)
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
@click.command('mbfc:create-tables')
def create_tables():
    """Build ``mbfc.publishers`` and ``mbfc.publisher_stories``.

    Publishers are merged with their site URLs, reduced to one row per
    registered domain (most common value per column), and then linked to
    stories whose URL maps to the same domain.
    """
    # Imported here because the module-level import from data.main does not
    # include this helper.
    from data.main import bias_label_to_int

    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    # NOTE(review): the merge key 'mbfc_url' must exist in both CSVs -- the
    # loader in this module writes 'origin_url'; confirm the column names.
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        # `df` is read by name from the SQL below.
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
            row_number() over() as id
            ,p.tld
            ,mode(p.name) as name
            ,mode(p.bias) as bias
            ,mode(p.ordinal) as ordinal
            ,mode(p.reporting) as reporting
            ,mode(p.country) as country
            ,mode(p.credibility) as credibility
            ,mode(p.media_type) as media_type
            ,mode(p.traffic) as traffic
            ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
            p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
            *
            FROM stories s
        """).df()

    # The domain column must be added to the frame the next query reads
    # (`raw_stories`); the original assigned to an undefined `stories`.
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
            s.id as story_id
            ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
            ON p.tld = s.tld
        """)

135
src/data/links.py Normal file
View File

@@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
    """Rebuild the ``link_edges`` table of publisher-to-publisher links.

    Link counts come from ``stories`` joined to ``related_stories``. Only
    edges whose child also appears as a parent, and whose parent also
    appears as a child, are kept. Each edge carries the raw count, the
    count normalized per parent, and a 0/1 indicator.
    """
    edges_sql = """
        CREATE OR REPLACE TABLE link_edges AS
        with cte as(
        SELECT
        s.publisher_id as parent_id
        ,r.publisher_id as child_id
        ,count(1) as links
        FROM stories s
        JOIN related_stories r
        ON s.id = r.parent_id
        group by
        s.publisher_id
        ,r.publisher_id
        )
        SELECT
        cte.parent_id
        ,cte.child_id
        ,cte.links as links
        ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
        ,case when cte.links > 0 then 1 else 0 end as onehot
        FROM cte
        WHERE cte.child_id in (
        SELECT
        distinct parent_id
        FROM cte
        )
        AND cte.parent_id in (
        SELECT
        distinct child_id
        FROM cte
        )
    """
    # Preview query over the new table; its result is not used here.
    preview_sql = """
        SELECT
        *
        ,count(1) over()
        FROM link_edges e
        limit 1
    """
    with connect() as db:
        db.query(edges_sql)
        db.query(preview_sql)
    print("created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels

    Pivots ``link_edges`` into a parent x child matrix using the chosen
    ``source`` column, projects each parent onto two principal components
    and stores them in ``publisher_pca_<source>``.
    """
    from sklearn.decomposition import PCA

    # `source` is restricted by click.Choice above, so interpolating it into
    # the table name and SQL is safe.
    table_name = f"publisher_pca_{source}"
    with connect() as db:
        # DISTINCT: the join yields one row per story, which would otherwise
        # duplicate every publisher in the merge below.
        pub = db.query("""
            SELECT DISTINCT
            p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON p.id = ps.publisher_id
        """).df()
        df = db.query(f"""
            SELECT
            parent_id
            ,child_id
            ,{source} as links
            FROM link_edges
        """).df()

    # Missing parent/child pairs mean "no links".
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    pca = PCA(n_components=2)
    components = pca.fit_transform(pivot)

    # `out` is read by name from the SQL below.
    out = pivot.reset_index()[['parent_id']]
    out['first'] = components[:, 0]
    out['second'] = components[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
            out.id as publisher_id
            ,out.first as first
            ,out.second as second
            FROM out
        """)
    print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """create link adj. matrix clusters table

    Pivots ``link_edges`` into a parent x child matrix using the chosen
    ``source`` column, assigns each parent to one of 8 k-means clusters and
    stores the labels in ``publisher_clusters_<source>``.
    """
    from sklearn.cluster import KMeans

    # `source` is restricted by click.Choice above, so interpolating it into
    # the table name and SQL is safe.
    table_name = f"publisher_clusters_{source}"
    with connect() as db:
        df = db.query(f"""
            SELECT
            parent_id
            ,child_id
            ,{source} as links
            FROM link_edges
        """).df()
        # DISTINCT: the join yields one row per story, which would otherwise
        # duplicate every publisher in the merge below.
        pub = db.query("""
            SELECT DISTINCT
            p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
        """).df()

    # Missing parent/child pairs mean "no links".
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    # `new_table` is read by name from the SQL below.
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
            n.id as publisher_id
            ,n.label as label
            FROM new_table n
        """)
    print(f"created {table_name}")

View File

@@ -2,6 +2,10 @@ import os
from pathlib import Path
import duckdb
from enum import Enum
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
import sys
class Data(str, Enum):
Titles = 'titles'
@@ -9,6 +13,16 @@ class Data(str, Enum):
def data_dir():
    """Return the directory named by ``DATA_MINING_DATA_DIR`` as a Path.

    Raises KeyError when the environment variable is not set.
    """
    root = os.environ['DATA_MINING_DATA_DIR']
    return Path(root)
def paths(name='app'):
    """Return a project directory selected by keyword.

    ``name`` is matched by substring, first match wins, in this order:
    'app' -> DATA_MINING_APP_DIR, 'data' -> DATA_MINING_DATA_DIR,
    'doc' -> DATA_MINING_DOCS_DIR, 'figure' -> DATA_MINING_DOCS_DIR/figures.

    Raises:
        ValueError: if ``name`` matches none of the keywords (previously
            this fell through and returned None, which only failed later
            when the result was used as a path).
        KeyError: if the required environment variable is not set.
    """
    locations = (
        ('app', 'DATA_MINING_APP_DIR', None),
        ('data', 'DATA_MINING_DATA_DIR', None),
        ('doc', 'DATA_MINING_DOCS_DIR', None),
        ('figure', 'DATA_MINING_DOCS_DIR', 'figures'),
    )
    for keyword, variable, subdir in locations:
        if keyword in name:
            root = Path(os.environ[variable])
            return root / subdir if subdir else root
    raise ValueError(f"unknown path name: {name!r}")
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@@ -28,3 +42,66 @@ def from_db(t: Data):
limit 100
""").df()
return table
def map_tld(x):
    """Return the first-level domain (``fld``) of URL ``x``, or None.

    Any value ``get_tld`` cannot parse is reported on stderr and mapped to
    None, so the function can be applied over a whole column.
    """
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    except Exception:
        # Exception rather than a bare except: Ctrl-C and SystemExit must
        # still be able to stop a long-running apply().
        print(f"'{x}' is not valid.", file=sys.stderr)
        return None
def ticklabels():
    """Return the five bias labels ordered from left to right."""
    labels = ['Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right']
    return labels
def bias_label_to_int(rating: str, source: str = 'mbfc') -> int:
    """Map a bias label to its ordinal position (0 = left ... 4 = right).

    Args:
        rating: the label to convert.
        source: 'mbfc' selects the title-cased MBFC labels; any other value
            selects the lower-case label set with 'center' in the middle.

    Returns:
        The ordinal 0-4, or -1 (with a message on stderr) when the label is
        not in the selected mapping.
    """
    if source == 'mbfc':
        mapping = {
            'Left': 0,
            'Left-Center': 1,
            'Least Biased': 2,
            'Right-Center': 3,
            'Right': 4,
        }
    else:
        mapping = {
            'left': 0,
            'left-center': 1,
            'center': 2,
            'right-center': 3,
            'right': 4,
        }
    try:
        return mapping[rating]
    except (KeyError, TypeError):
        # KeyError: unknown label (including NaN from pandas).
        # TypeError: unhashable value. Nothing else is swallowed.
        print(f"no mapping for {rating}", file=sys.stderr)
        return -1
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    """Map an ordinal bias class (0 = left ... 4 = right) to its label.

    Args:
        class_id: the ordinal to convert.
        source: 'mbfc' selects the title-cased MBFC labels; any other value
            selects the lower-case label set with 'center' in the middle.

    Returns:
        The label string. For an unknown ``class_id`` a message is written
        to stderr and the integer -1 is returned; that sentinel is kept for
        existing callers even though it does not match the annotation.
    """
    if source == 'mbfc':
        mapping = {
            0: 'Left',
            1: 'Left-Center',
            2: 'Least Biased',
            3: 'Right-Center',
            4: 'Right',
        }
    else:
        mapping = {
            0: 'left',
            1: 'left-center',
            2: 'center',
            3: 'right-center',
            4: 'right',
        }
    try:
        return mapping[class_id]
    except (KeyError, TypeError):
        # KeyError: unknown class id. TypeError: unhashable value.
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1

View File

@@ -319,12 +319,6 @@ def another_norm():
""")
def map_tld(x):
    """Return the first-level domain (``fld``) of URL ``x``, or None if it cannot be parsed."""
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # still propagate.
        return None
DB.sql("""
SELECT

47
src/data/selection.py Normal file
View File

@@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np
def create_tables():
    """Build the ``top`` schema: publishers, stories and related stories.

    The selected publishers are those that appear as a parent in
    ``link_edges``. Stories are limited to those publishers and to
    publication years 2006 through 2022; related stories are limited to
    children of those stories.
    """
    with connect() as db:
        edges = db.query("""
            select
            *
            from link_edges
        """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    # `select_publishers` is read by name from the SQL below.
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        # IF NOT EXISTS keeps the function re-runnable, matching the
        # CREATE OR REPLACE used for every table.
        db.query("create schema if not exists top")
        db.query("""
            CREATE OR REPLACE TABLE top.publishers AS
            SELECT
            p.*
            FROM publishers p
            JOIN select_publishers s
            ON s.publisher_id = p.id
        """)
        db.query("""
            CREATE OR REPLACE TABLE top.stories AS
            SELECT
            s.*
            FROM stories s
            JOIN top.publishers p
            ON s.publisher_id = p.id
            WHERE year(s.published_at) >= 2006
            AND year(s.published_at) < 2023
        """)
        db.query("""
            CREATE OR REPLACE TABLE top.related_stories AS
            SELECT
            r.*
            FROM top.stories s
            JOIN related_stories r
            ON s.id = r.parent_id
        """)

86
src/data/sentiment.py Normal file
View File

@@ -0,0 +1,86 @@
import click
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data.main import connect, paths
import numpy as np
from tqdm import tqdm
import pandas as pd
@click.command("sentiment:extract")
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
def extract(chunks):
    """Classify every story title and save the predicted class ids.

    Titles are read from ``stories``, ASCII-normalized, split into
    ``chunks`` batches and run through a DistilBERT sequence classifier.
    The arg-max class per title is written to ``sentiment.npy`` and the
    matching story ids to ``sentiment_ids.npy`` in the data directory.

    Args:
        chunks: number of batches to split the titles into.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load model from HuggingFace Hub
    # NOTE(review): "distilbert-base-uncased" is presumably the base
    # checkpoint, without a fine-tuned classification head -- confirm the
    # predicted classes are meaningful as sentiment.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model = model.to(device)

    # load data
    with connect() as db:
        table = db.sql("""
            select
            id
            ,title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

    # Honor the --chunks option; it used to be overwritten with 1000 here.
    chunked = np.array_split(table, chunks)

    sentiments = []
    story_ids = []
    for chunk in tqdm(chunked, 'embedding'):
        sentences = chunk['title'].tolist()
        if not sentences:
            # More chunks than rows leaves empty batches; nothing to score.
            continue
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**encoded_input.to(device)).logits
        sentiments.extend(logits.argmax(axis=1).tolist())
        story_ids.extend(chunk['id'].tolist())

    sentiments = np.array(sentiments)
    story_ids = np.array(story_ids)

    # Save under paths('data'), the data-directory helper this module imports.
    # save embeddings
    save_to = paths('data') / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
    save_to = paths('data') / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")
@click.command('sentiment:load')
def load():
    """Load the saved sentiment predictions into ``story_sentiments``.

    Reads ``sentiment.npy`` and ``sentiment_ids.npy`` from the data
    directory and stores one row per story that exists in ``stories``,
    labelling class 1 'positive' and every other class 'negative'.
    """
    data_root = paths('data')
    story_ids = np.load(data_root / 'sentiment_ids.npy')
    sentiments = np.load(data_root / 'sentiment.npy')

    # `data` is read by name from the SQL below.
    data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
    data['sentiment_id'] = sentiments

    create_sql = """
        CREATE OR REPLACE TABLE story_sentiments AS
        SELECT
        data.story_id
        ,data.sentiment_id as class_id
        ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
        FROM data
        JOIN stories s
        ON s.id = data.story_id
    """
    with connect() as db:
        db.query(create_sql)