add mbfc data. use context manager for db. add paths fn.
This commit is contained in:
@@ -1,6 +1,10 @@
|
||||
import data.main
|
||||
import data.scrape
|
||||
import data.factcheck
|
||||
import data.links
|
||||
# Submodules exposed by `from data import *`.
__all__ = ['main', 'scrape', 'factcheck', 'links']
|
||||
|
||||
171
src/data/factcheck.py
Normal file
171
src/data/factcheck.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import requests
|
||||
from lxml import etree
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from io import BytesIO
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
import click
|
||||
from data.main import connect, map_tld, paths
|
||||
from random import randint
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@click.command('mbfc:parse-index')
def parse_index():
    """Scrape the MBFC filtered-search index into ``mbfc_bias.csv``.

    Walks result pages 1-53, reads one publisher per table row and writes
    all rows as a pipe-separated CSV into the data directory.
    """
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        # A timeout keeps a single stalled connection from hanging the run.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # Do not parse an error page as if it were an (empty) listing.
            print(f"skipping {url}: HTTP {response.status_code}", file=sys.stderr)
            continue
        tree = etree.parse(BytesIO(response.content), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            # Each row is expected to hold exactly these eight cells, in order.
            (link, bias, reporting, country,
             credibility, media_type, traffic, popularity) = row.iterchildren()
            anchor = link.xpath('./a')[0]
            publishers.append({
                'name': anchor.text,
                'detail_url': anchor.get('href'),
                'bias': bias.text,
                'reporting': reporting.text,
                'country': country.text,
                'credibility': credibility.text,
                'media_type': media_type.text,
                'traffic': traffic.text,
                'popularity': popularity.xpath('./span')[0].text,
            })

    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
|
||||
|
||||
@click.command("mbfc:schema")
def schema():
    """Create the ``mbfc`` schema and (re)create the ``mbfc.scrape`` table.

    The table is built with ``create or replace``, so running this command
    again discards any rows already recorded in ``mbfc.scrape``.
    """
    with connect() as db:
        # "if not exists" keeps the command re-runnable; a plain
        # "create schema" errors out once the schema is already there.
        db.sql("""create schema if not exists mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
        )
        """)
|
||||
|
||||
@click.command("mbfc:scrape")
def scrape():
    """Download every MBFC detail page not yet logged in ``mbfc.scrape``.

    Reads the detail URLs from ``mbfc_bias.csv``, skips those already
    recorded, saves each remaining page as ``<data>/mbfc/<slug>.html`` and
    logs the URL in ``mbfc.scrape`` after a successful download.
    """
    # The queries below refer to this frame by its variable name (`df`).
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")

    with connect() as db:
        stats = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
                on df.detail_url = s.url
        """).fetchall()
        pending = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    # Make sure the target directory exists before the first write.
    out_dir = paths('data') / 'mbfc'
    out_dir.mkdir(parents=True, exist_ok=True)

    for url in pending.url:
        delay = randint(1, 3)
        save_as = out_dir / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url, timeout=30)
            # Treat HTTP errors as failures so an error page is neither
            # saved nor logged as scraped.
            response.raise_for_status()
        except requests.RequestException:
            print(f"request failed: {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)
|
||||
|
||||
def load():
    """Extract each publisher's source link from the saved MBFC pages.

    For every file in ``<data>/mbfc`` the first ``<p>`` that contains a
    "source:" text node and an ``<a>`` supplies the ``tld`` value (the raw
    ``href``). Results are written to ``mbfc_publisher_url.csv``
    (pipe-separated) with columns ``origin_url`` and ``tld``.
    """
    source_marker = re.compile(r'source:', re.IGNORECASE)
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {'origin_url': f"https://mediabiasfactcheck.com/{page.stem}"}
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')

        for text_node in tree(string=source_marker):
            # find_parent returns None when there is no enclosing <p>,
            # where walking .parent by hand would fail on None.
            paragraph = text_node.find_parent('p')
            if paragraph is None:
                continue
            anchor = paragraph.find('a')
            if anchor:
                publisher['tld'] = anchor.get('href')
                break
        else:
            # Report and move on rather than dropping into a debugger
            # in the middle of a batch run.
            print(f"no source link found: {page}", file=sys.stderr)
            publisher['tld'] = None
        publishers.append(publisher)

    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
|
||||
|
||||
@click.command('mbfc:create-tables')
def create_tables():
    """Build ``mbfc.publishers`` and ``mbfc.publisher_stories``.

    Publishers come from the two MBFC CSV files, reduced to one row per
    registered domain (``tld``); stories are linked to publishers through
    the domain of their URL.
    """
    # Imported here because the module-level import from data.main does not
    # include it; without this the apply() below raises NameError.
    from data.main import bias_label_to_int

    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    # NOTE(review): both files must carry an 'mbfc_url' column for this
    # merge - confirm against whatever produces them.
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        # `df` is resolved by DuckDB from the local variable of that name.
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    # The frame is called raw_stories (the original referenced an undefined
    # `stories`); the query below reads it under that name.
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
                s.id as story_id
                ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)
|
||||
|
||||
|
||||
135
src/data/links.py
Normal file
135
src/data/links.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import click
|
||||
from data.main import connect
|
||||
import pandas as pd
|
||||
|
||||
@click.command('links:create-table')
def create_table():
    # Builds link_edges: per (parent publisher, child publisher) pair the
    # number of story links, that count divided by the parent's total, and
    # a 0/1 flag. Only pairs whose child also occurs as a parent and whose
    # parent also occurs as a child are kept.
    build_edges = """
        CREATE OR REPLACE TABLE link_edges AS
        with cte as(
            SELECT
                s.publisher_id as parent_id
                ,r.publisher_id as child_id
                ,count(1) as links
            FROM stories s
            JOIN related_stories r
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            cte.parent_id
            ,cte.child_id
            ,cte.links as links
            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
            ,case when cte.links > 0 then 1 else 0 end as onehot
        FROM cte
        WHERE cte.child_id in (
            SELECT
                distinct parent_id
            FROM cte
        )
        AND cte.parent_id in (
            SELECT
                distinct child_id
            FROM cte
        )
    """
    peek = """
        SELECT
            *
            ,count(1) over()
        FROM link_edges e
        limit 1
    """

    with connect() as db:
        db.query(build_edges)
        # Result is not consumed; kept as in the original.
        db.query(peek)

    print("created link_edges")
|
||||
|
||||
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels"""
    # Projects the parent x child link matrix onto two principal components
    # and stores them per publisher in publisher_pca_<source>.
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        # DISTINCT: the join only restricts to publishers that have stories.
        # Without it each publisher repeats once per story and the merge
        # below multiplies the output rows accordingly.
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON p.id = ps.publisher_id
        """).df()
        # `source` is limited to the click.Choice values above, so it is
        # safe to interpolate as a column name.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    # Adjacency matrix: one row per parent, one column per child, 0 = no link.
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    pca = PCA(n_components=2)
    components = pca.fit_transform(pivot)

    # Must stay named `out`: the query below reads it by variable name.
    out = pivot.reset_index()[['parent_id']]
    out['first'] = components[:, 0]
    out['second'] = components[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)

    print(f"created {table_name}")
|
||||
|
||||
|
||||
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """create link adj. matrix clusters table"""
    # Runs k-means (k=8) over the rows of the parent x child link matrix and
    # stores one label per publisher in publisher_clusters_<source>.
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"

    with connect() as db:
        # `source` is limited to the click.Choice values above, so it is
        # safe to interpolate as a column name.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        # DISTINCT: the join only restricts to publishers that have stories.
        # Without it each publisher repeats once per story and the merge
        # below multiplies the output rows accordingly.
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
        """).df()

    # Adjacency matrix: one row per parent, one column per child, 0 = no link.
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    labels = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = labels
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    # Must stay named `new_table`: the query below reads it by variable name.
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)

    print(f"created {table_name}")
|
||||
@@ -2,6 +2,10 @@ import os
|
||||
from pathlib import Path
|
||||
import duckdb
|
||||
from enum import Enum
|
||||
from urllib.parse import urlparse
|
||||
from tld import get_tld
|
||||
from tld.utils import update_tld_names
|
||||
import sys
|
||||
|
||||
class Data(str, Enum):
|
||||
Titles = 'titles'
|
||||
@@ -9,6 +13,16 @@ class Data(str, Enum):
|
||||
def data_dir():
    """Return the directory named by the ``DATA_MINING_DATA_DIR`` variable."""
    root = os.environ['DATA_MINING_DATA_DIR']
    return Path(root)
|
||||
|
||||
def paths(name='app'):
    """Return the project directory selected by *name*.

    *name* is matched by substring, first hit wins, in this order:

    * ``'app'``    -> ``$DATA_MINING_APP_DIR``
    * ``'data'``   -> ``$DATA_MINING_DATA_DIR``
    * ``'doc'``    -> ``$DATA_MINING_DOCS_DIR``
    * ``'figure'`` -> ``$DATA_MINING_DOCS_DIR/figures``

    Raises:
        ValueError: if *name* contains none of the keywords. The function
            used to fall through and return ``None``, which only surfaced
            later as a ``TypeError`` on ``None / 'file'``.
        KeyError: if the selected environment variable is not set.
    """
    lookup = (
        ('app', 'DATA_MINING_APP_DIR', None),
        ('data', 'DATA_MINING_DATA_DIR', None),
        ('doc', 'DATA_MINING_DOCS_DIR', None),
        ('figure', 'DATA_MINING_DOCS_DIR', 'figures'),
    )
    for keyword, env_var, subdir in lookup:
        if keyword in name:
            base = Path(os.environ[env_var])
            return base / subdir if subdir else base
    raise ValueError(
        f"unknown path name {name!r}; expected one containing "
        "'app', 'data', 'doc' or 'figure'"
    )
|
||||
|
||||
def connect():
|
||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
|
||||
@@ -28,3 +42,66 @@ def from_db(t: Data):
|
||||
limit 100
|
||||
""").df()
|
||||
return table
|
||||
|
||||
def map_tld(x):
    """Return the first-level domain (``fld``) of URL *x*, or ``None``.

    Any failure while parsing *x* is reported on stderr and yields ``None``
    so the function can be applied to a whole column without aborting.
    """
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    except Exception:
        # `except Exception` instead of a bare `except`, so Ctrl-C and
        # SystemExit still stop a long apply() run.
        print(f"'{x}' is not valid.", file=sys.stderr)
        return None
|
||||
|
||||
def ticklabels():
    """Return the five bias labels, ordered from 'Left' to 'Right'."""
    ordered = ('Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right')
    # A new list on every call, so callers may mutate the result freely.
    return list(ordered)
|
||||
|
||||
def bias_label_to_int(rating: str, source: str = 'mbfc') -> int:
    """Map a bias label to its ordinal position (0 = left ... 4 = right).

    Args:
        rating: the label to convert. With ``source='mbfc'`` the labels are
            'Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right';
            with any other source they are the lower-case
            'left', 'left-center', 'center', 'right-center', 'right'.
        source: selects which label set is used.

    Returns:
        The ordinal 0-4, or ``-1`` (with a message on stderr) when *rating*
        is not a known label.
    """
    if source == 'mbfc':
        mapping = {
            'Left': 0,
            'Left-Center': 1,
            'Least Biased': 2,
            'Right-Center': 3,
            'Right': 4,
        }
    else:
        mapping = {
            'left': 0,
            'left-center': 1,
            'center': 2,
            'right-center': 3,
            'right': 4,
        }
    try:
        return mapping[rating]
    except (KeyError, TypeError):
        # KeyError: unknown label (including NaN from pandas);
        # TypeError: unhashable value. A bare `except` would also have
        # swallowed KeyboardInterrupt.
        print(f"no mapping for {rating}", file=sys.stderr)
        return -1
|
||||
|
||||
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    """Map an ordinal bias class (0-4) back to its label.

    Args:
        class_id: ordinal position, 0 = left ... 4 = right.
        source: ``'mbfc'`` yields 'Left', 'Left-Center', 'Least Biased',
            'Right-Center', 'Right'; any other value yields the lower-case
            'left', 'left-center', 'center', 'right-center', 'right'.

    Returns:
        The label. For an unknown *class_id* a message is printed to stderr
        and the integer ``-1`` is returned; this sentinel is not a ``str``
        but is kept for backward compatibility.
    """
    if source == 'mbfc':
        mapping = {
            0: 'Left',
            1: 'Left-Center',
            2: 'Least Biased',
            3: 'Right-Center',
            4: 'Right',
        }
    else:
        mapping = {
            0: 'left',
            1: 'left-center',
            2: 'center',
            3: 'right-center',
            4: 'right',
        }
    try:
        return mapping[class_id]
    except (KeyError, TypeError):
        # KeyError: id outside 0-4; TypeError: unhashable value. A bare
        # `except` would also have swallowed KeyboardInterrupt.
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1
|
||||
|
||||
@@ -319,12 +319,6 @@ def another_norm():
|
||||
""")
|
||||
|
||||
|
||||
def map_tld(x):
    """Return the first-level domain (``fld``) of URL *x*, or ``None`` on failure."""
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    except Exception:
        # Deliberately silent best-effort; `except Exception` rather than a
        # bare `except` so KeyboardInterrupt/SystemExit still propagate.
        return None
|
||||
|
||||
DB.sql("""
|
||||
SELECT
|
||||
|
||||
47
src/data/selection.py
Normal file
47
src/data/selection.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from data.main import connect
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def create_tables():
    """Materialise the ``top`` schema: the subset used for link analysis.

    ``top.publishers`` holds the publishers that occur as a parent in
    ``link_edges``; ``top.stories`` holds their stories published from 2006
    through 2022; ``top.related_stories`` holds the related-story rows
    whose parent is one of those stories.
    """
    with connect() as db:
        edges = db.query("""
            select
                *
            from link_edges
        """).df()

    # Pivoting yields exactly one index entry per distinct parent publisher.
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    # Must stay named `select_publishers`: the query below reads it by name.
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        # "if not exists" keeps the function re-runnable; the tables below
        # are already CREATE OR REPLACE.
        db.query("create schema if not exists top")

        db.query("""
            CREATE OR REPLACE TABLE top.publishers AS
            SELECT
                p.*
            FROM publishers p
            JOIN select_publishers s
                ON s.publisher_id = p.id
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.stories AS
            SELECT
                s.*
            FROM stories s
            JOIN top.publishers p
                ON s.publisher_id = p.id
            WHERE year(s.published_at) >= 2006
            AND year(s.published_at) < 2023
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.related_stories AS
            SELECT
                r.*
            FROM top.stories s
            JOIN related_stories r
                ON s.id = r.parent_id
        """)
|
||||
86
src/data/sentiment.py
Normal file
86
src/data/sentiment.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import click
|
||||
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from data.main import connect, paths
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
@click.command("sentiment:extract")
@click.option('-c', '--chunks', type=int, default=1000, show_default=True)
def extract(chunks):
    """Classify every story title and save the predicted class ids.

    Titles are read from ``stories``, reduced to ASCII, split into
    ``chunks`` batches and run through the DistilBERT classifier. The
    arg-max class per title is written to ``sentiment.npy`` and the
    matching story ids to ``sentiment_ids.npy`` in the data directory.
    """
    # --chunks is honoured now. The body used to overwrite it with 1000, so
    # the option default is 1000 to keep a bare invocation unchanged.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load model from HuggingFace Hub
    # NOTE(review): "distilbert-base-uncased" appears to be the base
    # checkpoint without a fine-tuned classification head, in which case
    # these predictions come from freshly initialised weights - confirm
    # whether a sentiment fine-tuned checkpoint was intended.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model = model.to(device)

    # load data
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # normalize text: decompose accents, then drop everything non-ASCII
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

    chunked = np.array_split(table, chunks)

    sentiments = []
    story_ids = []
    for chunk in tqdm(chunked, 'embedding'):
        # array_split yields empty frames when there are fewer rows than
        # chunks; the tokenizer cannot take an empty batch.
        if chunk.empty:
            continue
        sentences = chunk['title'].tolist()
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Inference only, so no gradients are needed
        with torch.no_grad():
            logits = model(**encoded_input.to(device)).logits
        sentiments.append(logits.argmax(dim=1).tolist())
        story_ids.append(chunk['id'].tolist())

    sentiments = np.concatenate(sentiments)
    story_ids = np.concatenate(story_ids)

    # paths('data') replaces data_dir(), which this module never imports,
    # so the save step no longer fails after the whole inference pass.
    out_dir = paths('data')

    # save predictions
    save_to = out_dir / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
    save_to = out_dir / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")
|
||||
|
||||
@click.command('sentiment:load')
def load():
    # Rebuilds story_sentiments from sentiment.npy / sentiment_ids.npy in
    # the data directory, keeping only ids that exist in `stories`.
    data_root = paths('data')
    sentiments = np.load(data_root / 'sentiment.npy')
    story_ids = np.load(data_root / 'sentiment_ids.npy')

    # Must stay named `data`: the query below reads it by variable name.
    data = pd.DataFrame(story_ids, columns=['story_id'])
    data = data.reset_index()
    data['sentiment_id'] = sentiments

    create_sql = """
        CREATE OR REPLACE TABLE story_sentiments AS
        SELECT
            data.story_id
            ,data.sentiment_id as class_id
            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
        FROM data
        JOIN stories s
            ON s.id = data.story_id
    """
    with connect() as db:
        db.query(create_sql)
|
||||
Reference in New Issue
Block a user