add mbfc data. use context manager for db. add paths fn.

This commit is contained in:
matt 2023-06-01 09:44:01 -07:00
parent 398228f02c
commit 79808f01d6
40 changed files with 1354 additions and 1137 deletions
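The commit message names the three changes that recur throughout this diff: new MBFC (Media Bias/Fact Check) scraping and tables, paired DB = connect()/DB.close() calls replaced by a context manager, and a new paths() helper in src/data/main.py for resolving project directories. A minimal usage sketch of the two helpers, assuming the DATA_MINING_* environment variables are set; the query and the story_counts.csv filename are illustrative:

from data.main import connect, paths

# the duckdb connection is closed automatically on exit from the with
# block, replacing the DB = connect() / DB.close() pairs removed below
with connect() as db:
    df = db.query("select count(1) as stories from stories").df()

# paths() maps a short name ('app', 'data', 'doc', 'figure') to the
# corresponding DATA_MINING_* directory
df.to_csv(paths('data') / 'story_counts.csv', sep='|', index=False)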

17 binary image files (figures) changed; contents not shown.

27
src/apriori.py Normal file
View File

@ -0,0 +1,27 @@
import click
from efficient_apriori import apriori
from data.main import connect
@click.command("apriori:rules")
def rules():
DB = connect()
data = DB.query("""
SELECT
--list_prepend(parent.id, list(child.id)) as transaction
list_prepend(parent.tld, list(child.tld)) as transaction
FROM stories s
JOIN related_stories r
ON r.parent_id = s.id
JOIN publishers parent
ON parent.id = s.publisher_id
JOIN publishers child
ON child.id = r.publisher_id
GROUP BY
--parent.id
parent.tld
""").df()
DB.close()
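# each transaction pairs a citing publisher's tld with the tlds of every publisher it linked to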
transactions = data.transaction.apply(lambda x: tuple(x)).values
itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
print(*rules, sep="\n")

View File

@ -1,41 +1,16 @@
import click
from data.main import connect
from data.main import connect, paths
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
def label_to_int(rating:str) -> int:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
def int_to_label(class_id: int) -> str:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
-1 : 'allsides',
}
return mapping[class_id]
@click.command(name="bias:normalize")
def normalize() -> None:
DB = connect()
DB.sql("""
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
@ -72,9 +47,9 @@ def normalize() -> None:
]
mapping = pd.DataFrame(mapping)
DB.query("alter table bias_ratings add column ordinal int")
DB.query("""
with connect() as db:
db.query("alter table bias_ratings add column ordinal int")
db.query("""
update bias_ratings b
set ordinal = o.ordinal
FROM mapping o
@ -85,9 +60,7 @@ def normalize() -> None:
@click.command(name='bias:parse')
def parse() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
bias_html = paths('data') / 'allsides.html'
parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser)
@ -111,15 +84,14 @@ def parse() -> None:
rating['disagree'] = int(disagree)
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load")
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
f = str(paths('data') / "bias_ratings.csv")
DB.sql(f"""
with connect() as db:
db.sql(f"""
CREATE TABLE bias_ratings as
select
row_number() over(order by b.publisher) as id
@ -129,10 +101,8 @@ def load() -> None:
@click.command('bias:export')
def export():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
DB = connect()
all_bias = DB.query("""
with connect() as db:
all_bias = db.query("""
SELECT
id as bias_id
,publisher as name
@ -140,8 +110,10 @@ def export():
FROM bias_ratings
ORDER by agree desc
""")
all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
mapped_bias = DB.query("""
all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
with connect() as db:
mapped_bias = db.query("""
SELECT
p.id as publisher_id
,p.name as name
@ -152,18 +124,16 @@ def export():
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
DB.close()
mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
@click.command('bias:import-mapped')
def import_mapped():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
table_name = "top.publisher_bias"
DB = connect()
df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
DB.query(f"""
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
@ -171,5 +141,6 @@ def import_mapped():
FROM df
WHERE bias_id IS NOT NULL
""")
print(f"created table: {table_name}")

View File

@ -1,5 +1,7 @@
import click
from dotenv import load_dotenv
import data
import plots
@click.group()
def cli():
@ -7,12 +9,20 @@ def cli():
if __name__ == "__main__":
load_dotenv()
from data import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
cli.add_command(scrape.normalize)
cli.add_command(scrape.create_elections_table)
# original bias ratings
cli.add_command(data.scrape.download)
cli.add_command(data.scrape.parse)
cli.add_command(data.scrape.load)
cli.add_command(data.scrape.normalize)
cli.add_command(data.scrape.create_elections_table)
cli.add_command(data.factcheck.parse_index)
cli.add_command(data.factcheck.scrape)
cli.add_command(data.links.create_table)
cli.add_command(data.links.create_pca)
cli.add_command(data.links.create_clusters)
import word
# cli.add_command(word.distance)
@ -23,10 +33,12 @@ if __name__ == "__main__":
cli.add_command(bias.parse)
cli.add_command(bias.load)
cli.add_command(bias.normalize)
import mine
cli.add_command(mine.embeddings)
cli.add_command(mine.cluster)
cli.add_command(mine.plot)
import emotion
cli.add_command(emotion.extract)
cli.add_command(emotion.normalize)
@ -40,34 +52,20 @@ if __name__ == "__main__":
from train import main as train_main
cli.add_command(train_main.main)
import plots.descriptive as plotd
cli.add_command(plotd.articles_per_year)
cli.add_command(plotd.distinct_publishers)
cli.add_command(plotd.stories_per_publisher)
cli.add_command(plotd.top_publishers)
cli.add_command(plotd.common_tld)
import links as linkcli
cli.add_command(linkcli.create_table)
cli.add_command(linkcli.create_pca)
cli.add_command(linkcli.create_clusters)
import plots.links as plotl
cli.add_command(plotl.elbow)
cli.add_command(plotl.link_pca_clusters)
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.descriptive.articles_per_year)
cli.add_command(plots.descriptive.distinct_publishers)
cli.add_command(plots.descriptive.stories_per_publisher)
cli.add_command(plots.descriptive.top_publishers)
cli.add_command(plots.descriptive.common_tld)
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli.add_command(plots.links.elbow)
cli.add_command(plots.links.link_pca_clusters)
cli.add_command(plots.classifier.pca_with_classes)
cli()

View File

@ -1,6 +1,10 @@
import data.main
import data.scrape
import data.factcheck
import data.links
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
]

171
src/data/factcheck.py Normal file
View File

@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from random import randint
from time import sleep
from tqdm import tqdm
@click.command('mbfc:parse-index')
def parse_index():
parser = etree.HTMLParser()
publishers = []
for page in range(1, 54):
url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
print(f"downloading {url}", file=sys.stderr)
response = requests.get(url)
html = response.content
tree = etree.parse(BytesIO(html), parser)
rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
print(f"parsing {len(rows)} rows", file=sys.stderr)
for row in rows:
publisher = {}
link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
link = link.xpath('./a')[0]
publisher['name'] = link.text
publisher['detail_url'] = link.get('href')
publisher['bias'] = bias.text
publisher['reporting'] = reporting.text
publisher['country'] = country.text
publisher['credibility'] = credibility.text
publisher['media_type'] = media_type.text
publisher['traffic'] = traffic.text
publisher['popularity'] = popularity.xpath('./span')[0].text
publishers.append(publisher)
df = pd.DataFrame(publishers)
save_to = paths('data') / 'mbfc_bias.csv'
df.to_csv(save_to, sep='|', index=False)
print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
with connect() as db:
db.sql("""create schema mbfc""")
db.sql("""create or replace table mbfc.scrape (
url text
,scraped_at datetime default now()
)
""")
@click.command("mbfc:scrape")
def scrape():
df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
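# mbfc.scrape records every url already fetched, so interrupted runs resume with only the unseen detail pages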
with connect() as db:
stats = db.query("""
select
count(1) filter(where s.url is not null) as elapsed
,count(1) filter(where s.url is null) as remaining
from df
left join mbfc.scrape s
on df.detail_url = s.url
""").fetchall()
df = db.query("""
select
detail_url as url
from df
where df.detail_url not in (
select
url
from mbfc.scrape
)
""").df()
print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
for url in df.url:
delay = randint(1,3)
save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
sleep(delay)
try:
response = requests.get(url)
except Exception as e:
print(f"request failed: {url}", file=sys.stderr)
continue
with open(save_as, 'w') as f:
f.write(response.text)
with connect() as db:
db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
print(f"saved: {save_as}", file=sys.stderr)
def load():
publishers = []
for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
publisher = {}
publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
with page.open() as p:
tree = BeautifulSoup(p, 'html.parser')
for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
e = e.parent
while e.name != 'p':
e = e.parent
l = e.find('a')
if l:
publisher['tld'] = l.get('href')
break
else:
breakpoint()
publishers.append(publisher)
df = pd.DataFrame(publishers)
df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
@click.command('mbfc:create-tables')
def create_tables():
pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
df = pubs.merge(urls, on='mbfc_url')
df['tld'] = df.tld.apply(map_tld)
df['ordinal'] = df.bias.apply(bias_label_to_int)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publishers AS
SELECT
row_number() over() as id
,p.tld
,mode(p.name) as name
,mode(p.bias) as bias
,mode(p.ordinal) as ordinal
,mode(p.reporting) as reporting
,mode(p.country) as country
,mode(p.credibility) as credibility
,mode(p.media_type) as media_type
,mode(p.traffic) as traffic
,mode(p.popularity) as popularity
FROM df p
GROUP BY
p.tld
""")
with connect() as db:
raw_stories = db.sql("""
SELECT
*
FROM stories s
""").df()
raw_stories['tld'] = raw_stories.url.apply(map_tld)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publisher_stories AS
SELECT
s.id as story_id
,p.id as publisher_id
FROM raw_stories s
JOIN mbfc.publishers p
ON p.tld = s.tld
""")

135
src/data/links.py Normal file
View File

@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE link_edges AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM stories s
JOIN related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
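-- keep only publishers that appear on both sides of an edge,
-- adding a row-normalized weight and a one-hot indicator per link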
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
db.query("""
SELECT
*
,count(1) over()
FROM link_edges e
limit 1
""")
print(f"created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
with connect() as db:
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")

View File

@ -2,6 +2,10 @@ import os
from pathlib import Path
import duckdb
from enum import Enum
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
import sys
class Data(str, Enum):
Titles = 'titles'
@ -9,6 +13,16 @@ class Data(str, Enum):
def data_dir():
return Path(os.environ['DATA_MINING_DATA_DIR'])
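# new helper: resolve a project directory ('app', 'data', 'doc', 'figure') from its DATA_MINING_* environment variable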
def paths(name='app'):
if 'app' in name:
return Path(os.environ['DATA_MINING_APP_DIR'])
if 'data' in name:
return Path(os.environ['DATA_MINING_DATA_DIR'])
if 'doc' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR'])
if 'figure' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
raise ValueError(f"unknown path name: {name}")
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@ -28,3 +42,66 @@ def from_db(t: Data):
limit 100
""").df()
return table
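# reduce a url to its registered first-level domain, e.g. https://news.example.com/a -> example.com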
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except Exception:
print(f"'{x}' is not valid.", file=sys.stderr)
return None
def ticklabels():
return [
'Left',
'Left-Center',
'Least Biased',
'Right-Center',
'Right',
]
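# map a bias label to its 0-4 left-to-right ordinal; labels without a mapping fall back to -1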
def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
if source == 'mbfc':
mapping = {
'Left' : 0,
'Left-Center' : 1,
'Least Biased' : 2,
'Right-Center' : 3,
'Right' : 4,
}
else:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
}
try:
return mapping[rating]
except KeyError:
print(f"no mapping for {rating}", file=sys.stderr)
return -1
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
if source == 'mbfc':
mapping = {
0 : 'Left',
1 : 'Left-Center',
2 : 'Least Biased',
3 : 'Right-Center',
4 : 'Right',
}
else:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
}
try:
return mapping[class_id]
except KeyError:
print(f"no mapping for {class_id}", file=sys.stderr)
return -1

View File

@ -319,12 +319,6 @@ def another_norm():
""")
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
return None
DB.sql("""
SELECT

47
src/data/selection.py Normal file
View File

@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np
def create_tables():
with connect() as db:
edges = db.query("""
select
*
from link_edges
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
with connect() as db:
db.query("create schema top")
db.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
db.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
db.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,10 +1,11 @@
import click
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data import connect, data_dir
from data.main import connect, paths
import numpy as np
from tqdm import tqdm
import click
import pandas as pd
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
@ -67,20 +68,19 @@ def extract(chunks):
@click.command('sentiment:load')
def load():
DB = connect()
sentiments = np.load(data_dir() / 'sentiment.npy')
story_ids = np.load(data_dir() / 'sentiment_ids.npy')
sentiments = np.load(paths('data') / 'sentiment.npy')
story_ids = np.load(paths('data') / 'sentiment_ids.npy')
data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
data['sentiment_id'] = sentiments
DB.query("""
CREATE OR REPLACE TABLE top.story_sentiments AS
with connect() as db:
db.query("""
CREATE OR REPLACE TABLE story_sentiments AS
SELECT
data.story_id
,data.sentiment_id as class_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
FROM data
JOIN top.stories s
JOIN stories s
ON s.id = data.story_id
""")
DB.close()

View File

@ -1,255 +0,0 @@
import click
from data.main import connect
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
@click.command('links:create-table')
def create_table():
table_name = "top.link_edges"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM top.stories s
JOIN top.related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
DB.close()
DB = connect()
DB.query("""
SELECT
*
,-log10(links)
--distinct parent_id
FROM top.link_edges e
WHERE e.parent_id = 238
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"top.publisher_pca_{source}"
DB = connect()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
from sklearn.cluster import KMeans
table_name = f"top.publisher_clusters_{source}"
DB = connect()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
DB.close()
print(f"created {table_name}")
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
DB = connect()
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
from publishers p
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
# only keep values that have more than 1 link
test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
adj.values.shape
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
return out
@click.command('links:analysis')
def analysis():
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
adj = to_matrix()
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
plt.show()

View File

@ -1,6 +1,5 @@
from data.main import data_dir, connect
from data.main import connect, paths
import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans
import click
from pathlib import Path
@ -11,7 +10,7 @@ from enum import Enum, auto
@click.command(name="mine:embeddings")
def embeddings():
data = np.load(data_dir() / "embeddings.npy")
data = np.load(paths('data') / "embeddings.npy")
kmeans = MiniBatchKMeans(n_clusters=5,
random_state=0,
batch_size=6,
@ -76,7 +75,7 @@ class PlotName(str, Enum):
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: Path):
output = output if output else APP_DIR / f'docs/{name}.png'
output = output if output else paths('figures') / f'{name}.png'
if name == PlotName.TitleLength:
fig, ax = plt.subplots(1,1)
data = db.sql("""

36
src/mining/bias.py Normal file
View File

@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path
def normalize():
with connect() as db:
db.sql("""
SELECT
p.name
,count(1) as ctn
,sum(ctn) over() as total
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.name
""")
with connect() as db:
db.sql("""
SELECT
bias
,count(distinct p.id) as publishers
,count(1) as stories
,count(1) / count(distinct p.id) as ratio
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.bias
ORDER BY count(1)
""")

View File

@ -1,9 +1,13 @@
import plots.sentence
import plots.emotion
import plots.sentiment
import plots.links
import plots.classifier
__all__ = [
'sentence',
'emotion',
'sentiment',
'links',
'classifier',
]

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, bias_label_to_int, ticklabels, paths
import os
from pathlib import Path
import seaborn as sns
@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
def hist():
filename = "bias_hist.png"
save_to = paths('figures') / "bias_hist.png"
DB = connect()
data = DB.sql("""
with connect() as db:
data = db.sql("""
SELECT
b.ordinal
p.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
b.ordinal
p.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-publisher-hist')
def publisher_hist():
filename = "bias_publisher_hist.png"
save_to = paths('figures') / "bias_publisher_hist.png"
DB = connect()
data = DB.sql("""
with connect() as db:
data = db.sql("""
SELECT
b.ordinal
,count(1) as publishers
FROM publisher_bias pb
JOIN bias_ratings b
ON b.id = pb.bias_id
p.ordinal
,count(distinct p.id) as publishers
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
b.ordinal
p.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")

View File

@ -5,30 +5,32 @@ import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:pca-with-classes')
def pca_with_classes():
filename = "pca_with_classes.png"
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def pca_with_classes(source):
DB = connect()
data = DB.query(f"""
save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
with connect() as db:
df = db.query(f"""
SELECT
p.tld
,b.bias
,p.bias
,c.first
,c.second
,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM top.publishers p
JOIN top.publisher_bias pb
ON p.id = pb.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN top.publisher_pca_normalized c
--,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM mbfc.publishers p
JOIN publisher_pca_{source} c
ON c.publisher_id = p.id
WHERE p.ordinal != -1
ORDER BY p.ordinal
""").df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
print(f"saved: {filename}")
ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
ax.set(xlabel="first pca component",
ylabel="second pca component")
ax.figure.suptitle("pca components vs. bias labels")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')

View File

@ -1,18 +1,16 @@
import click
from data.main import connect
from data.main import connect, paths
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:articles-per-year')
def articles_per_year():
filename = 'articles_per_year.png'
save_to = paths('figures') / 'articles_per_year.png'
DB = connect()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
@ -21,19 +19,19 @@ def articles_per_year():
group by
year(published_at)
""").df()
DB.close()
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of articles per year", ylabel="count of stories (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('plot:distinct-publishers')
def distinct_publishers():
filename = 'distinct_publishers.png'
save_to = paths('figures') / 'distinct_publishers.png'
DB = connect()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
@ -42,30 +40,32 @@ def distinct_publishers():
group by
year(published_at)
""").df()
DB.close()
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher')
def stories_per_publisher():
filename = 'stories_per_publisher.png'
save_to = paths('figures') / 'stories_per_publisher.png'
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
with cte as (
select
publisher_id
,year(published_at) as year
ps.publisher_id
,year(s.published_at) as year
,count(1) as stories
from stories
from stories s
join mbfc.publisher_stories ps
on ps.story_id = s.id
group by
publisher_id
,year(published_at)
ps.publisher_id
,year(s.published_at)
) , agg as (
select
publisher_id
@ -91,64 +91,86 @@ def stories_per_publisher():
group by
max_avg
""").df()
DB.close()
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:top-publishers')
def top_publishers():
"""plot top publishers over time"""
filename = 'top_publishers.png'
save_to = paths('figures') / 'top_publishers.png'
DB = connect()
data = DB.query("""
select
p.tld
,year(published_at) as year
,count(1) as stories
from (
select
with connect() as db:
db.query("""
SELECT
p.tld
,p.id
from top.publishers p
join top.stories s
on s.publisher_id = p.id
group by
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
) p
join top.stories s
on s.publisher_id = p.id
group by
""")
with connect() as db:
data = db.query("""
WITH p as (
SELECT
p.tld
,year(published_at)
order by count(distinct s.id) desc
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
)
SELECT
p.tld
,YEAR(s.published_at) AS year
,COUNT(1) AS stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN p
ON p.id = ps.publisher_id
GROUP by
p.tld
,YEAR(published_at)
ORDER BY year, COUNT(DISTINCT s.id) DESC
""").df()
DB.close()
pivot = data.pivot(columns='year', index='tld', values='stories')
ax = sns.heatmap(pivot, cmap="crest")
ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:common_tld')
def common_tld():
import dataframe_image as dfi
filename = 'common_tld.png'
save_to = paths('figures') / 'common_tld.png'
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
select
split_part(url, '.', -1) as tld
,count(1) as publishers
@ -162,8 +184,7 @@ def common_tld():
order by
count(1) desc
""").df()
DB.close()
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
def stats():
@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats')
def bias_stats():
import dataframe_image as dfi
filename = 'bias_stats.png'
save_to = paths('figures') / 'bias_stats.png'
DB = connect()
@ -300,3 +321,69 @@ def bias_stats():
""").df()
DB.close()
print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
def bias_over_time():
"""plot bias labels over time"""
save_to = paths('figures') / 'bias_over_time.png'
with connect() as db:
df = db.sql("""
SELECT
p.bias
,p.id
,date_trunc('year', s.published_at) as year
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
where year(s.published_at) not in (2006, 2023)
and p.ordinal != -1
GROUP BY
p.bias
,p.id
,p.ordinal
,date_trunc('year', s.published_at)
order by
p.ordinal
,date_trunc('year', s.published_at)
""").df()
ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
def bias_missing():
with connect() as db:
df = db.sql("""
SELECT
date_trunc('year', s.published_at) as year
,s.tld
,count(1) as stories
FROM stories s
LEFT JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
WHERE ps.publisher_id is NULL
AND year(s.published_at) not in (2006, 2023)
GROUP BY
s.tld
,date_trunc('year', s.published_at)
HAVING count(1) > 10
ORDER BY
date_trunc('year', s.published_at)
""").df()
ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.show()
#plt.savefig(save_to)
plt.close()
#print(f"saved: {save_to}")

View File

@ -1,25 +1,24 @@
import click
from data.main import connect
from data.main import connect, paths, ticklabels
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
def emotion_over_time():
filename = "emotion_over_time.png"
DB = connect()
emotions = DB.sql("""
filename = "emotion_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
emotions = db.sql("""
SELECT
date_trunc('year', s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM top.stories s
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
@ -28,50 +27,53 @@ def emotion_over_time():
date_trunc('year', s.published_at)
,e.label
""").df()
DB.close()
ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression')
def emotion_regression():
"""plot emotion over time as regression"""
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
filename = "emotion_regression.png"
save_to = paths('figures') / filename
DB = connect()
emotions = DB.query("""
SELECT
label
FROM emotions e
""").df()['label'].to_list()
DB.close()
DB = connect()
df = DB.sql(f"""
with connect() as db:
#emotions = db.query("""
# SELECT
# label
# FROM emotions e
#""").df()['label'].to_list()
df = db.sql(f"""
SELECT
epoch(date_trunc('yearweek', s.published_at)) AS date
,e.id AS emotion_id
,p.id as publisher_id
,count(1) AS stories
FROM top.stories s
JOIN top.publishers p
ON p.id = s.publisher_id
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE p.ordinal != -1
GROUP by
epoch(date_trunc('yearweek', s.published_at))
,p.id
,e.id
""").df()
DB.close()
results = []
for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@ -83,77 +85,59 @@ def emotion_regression():
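# the regression fit inside this loop is elided by the diff; per_year below is the fitted stories-per-year slope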
results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
results = pd.DataFrame(results)
DB = connect()
out = DB.query("""
with connect() as db:
out = db.query("""
SELECT
e.label as emotion
--,p.tld
,avg(results.per_year) as avg_reg_coef
,b.ordinal
,p.bias
FROM results
JOIN emotions e
ON e.id = results.emotion_id
JOIN top.publishers p
JOIN mbfc.publishers p
ON p.id = results.publisher_id
JOIN publisher_bias pb
ON pb.publisher_id = results.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
e.label
,b.ordinal
,p.bias
""").df()
DB.close()
pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
ax = sns.heatmap(pivot, cmap='RdBu_r')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
#ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
ax.set(title="slope of regression (stories/year) by bias and emotion"
,xticklabels=ticklabels
,xticklabels=ticklabels()
,xlabel="bias"
,ylabel="emotion")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:emotion-hist')
def emotion_hist():
filename = "emotion_hist.png"
save_to = paths('figures') / filename
DB = connect()
DB.query("""describe story_emotions""")
DB.query("""
select
e.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from story_emotions se
join emotions e
on e.id = se.emotion_id
join top.stories s
on s.id = se.story_id
group by
e.label
""").df().to_markdown(index=False)
data = DB.sql("""
with connect() as db:
data = db.sql("""
SELECT
b.ordinal
p.bias
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
b.ordinal
p.bias
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow')
def elbow():
from sklearn.cluster import KMeans
filename = 'link_cluster_elbow.png'
save_to = paths('figures') / 'link_cluster_elbow.png'
DB = connect()
df = DB.query("""
with connect() as db:
df = db.query("""
SELECT
*
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
to_plot = []
@ -36,8 +36,9 @@ def elbow():
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {save_to}")
# randomly pick 8
@ -45,10 +46,10 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
filename = f"link_pca_clusters_{source}.png"
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
DB = connect()
df = DB.query(f"""
with connect() as db:
df = db.query(f"""
SELECT
c.label as cluster
,p.tld
@ -72,21 +73,17 @@ def link_pca_clusters(source):
JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
DB.close()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
# .df().groupby(['cluster', 'bias']).describe()
plt.savefig(save_to)
print(f"saved plot: {save_to}")
def test():
data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
DB.query("""
with connect() as db:
db.query("""
SELECT
p.id as publisher_id
,p.name
@ -109,9 +106,6 @@ def test():
ORDER BY count(1) desc
""")
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
DB.close()
@click.command('plot:link-confusion')
def link_confusion():
@ -120,9 +114,10 @@ def link_confusion():
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / filename
DB = connect()
bias = DB.query("""
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
@ -133,7 +128,7 @@ def link_confusion():
ON b.id = pb.bias_id
""").df()
df = DB.query("""
df = db.query("""
SELECT
*
FROM top.link_edges
@ -148,6 +143,7 @@ def link_confusion():
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
@ -166,9 +162,9 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
@click.command('plot:link-classifier')
def link_classifier():
@ -176,10 +172,10 @@ def link_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / "link_confusion.png"
DB = connect()
bias = DB.query("""
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
@ -190,7 +186,7 @@ def link_confusion():
ON b.id = pb.bias_id
""").df()
df = DB.query("""
df = db.query("""
SELECT
*
FROM top.link_edges
@ -205,12 +201,14 @@ def link_confusion():
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
y = bias.sort_values('publisher_id').ordinal
data = DB.query(f"""
with connect() as db:
data = db.query(f"""
SELECT
p.id as publisher_id
,pca.first
@ -235,11 +233,11 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
plt.savefig(out_dir / filename)
plt.close()
print(f"saved plot: {filename}")
# ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
# plt.savefig(out_dir / filename)
# plt.close()
# print(f"saved plot: {filename}")

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths
import os
from pathlib import Path
import seaborn as sns
@ -7,15 +7,12 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
def sentence_pca():
filename = "embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "embedding_sentence_pca.png"
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
@ -28,18 +25,17 @@ def sentence_pca():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
filename = "avg_embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "avg_embedding_sentence_pca.png"
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
@ -53,11 +49,10 @@ def avg_sentence_pca():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:sentence-confusion')
def sentence_confusion():
@ -65,14 +60,14 @@ def sentence_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "sentence_confusion.png"
save_to = paths('figures') / "sentence_confusion.png"
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
@ -85,12 +80,11 @@ def sentence_confusion():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
train, test = train_test_split(data)
train_x, train_y = embeddings[train['index']], train['ordinal']
@ -105,7 +99,7 @@ def sentence_confusion():
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

View File

@ -1,20 +1,16 @@
import click
from data.main import connect
import os
from pathlib import Path
from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
def over_time():
filename = "sentiment_over_time.png"
DB = connect()
data = DB.sql("""
filename = "sentiment_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date
@ -24,115 +20,116 @@ def over_time():
GROUP BY
s.published_at
""").df()
DB.close()
ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
ax.set(title="sentiment vs. time")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
filename = "bias_vs_sentiment_over_time.png"
"""plot sentiment/bias vs. time"""
DB = connect()
data = DB.sql("""
filename = "bias_vs_sentiment_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
with cte as (
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
--,b.ordinal as ordinal
,b.bias
FROM top.story_sentiments sent
JOIN top.stories s
,p.bias
FROM story_sentiments sent
JOIN stories s
ON s.id = sent.story_id
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
date_trunc('yearweek', s.published_at)
,b.bias
,p.bias
)
SELECT
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,date
,bias
FROM cte
WHERE year(date) not in (2005, 2023)
""").df()
DB.close()
order = ['left', 'left-center', 'center', 'right-center', 'right']
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
#ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
filename = "bias_vs_recent_winner.png"
"""plot bias vs. distance to election"""
DB = connect()
data = DB.sql("""
filename = "bias_vs_recent_winner.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
SELECT
e.days_away as days_away
,b.ordinal
round(e.days_away, -1) as days_away
,p.bias
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM top.stories s
JOIN top.story_sentiments sent
FROM stories s
JOIN story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
GROUP BY
e.days_away
,b.ordinal
round(e.days_away, -1)
,p.bias
""").df()
DB.close()
data
ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment")
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-hist')
def sentiment_hist():
filename = "sentiment_hist.png"
save_to = paths('figures') / filename
DB = connect()
DB.query("""
select
sent.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from top.story_sentiments sent
join top.stories s
on s.id = sent.story_id
group by
sent.label
""").df().to_markdown(index=False)
data = DB.sql("""
with connect() as db:
data = db.sql("""
SELECT
b.ordinal
p.bias
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
b.ordinal
p.bias
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np
DB = connect()
edges = DB.query("""
select
*
from link_edges
""").df()
DB.close()
edges
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
DB = connect()
DB.query("create schema top")
DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, data_dir
from data.main import connect, paths
import os
from pathlib import Path
import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
ids = np.concatenate(embedding_ids)
# save embeddings
save_to = data_dir() / 'embeddings.npy'
save_to = paths('data') / 'embeddings.npy'
np.save(save_to, embeddings)
print(f"embeddings saved: {save_to}")
@ -75,15 +75,15 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
@ -97,7 +97,6 @@ def create_avg_pca_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
results = []
for publisher_id, group in data.groupby(['publisher_id']):
@ -115,8 +114,8 @@ def create_avg_pca_table():
results['second'] = pred[:, 1]
table_name = "top.publisher_embeddings_pca"
DB = connect()
DB.query(f"""
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id
@ -124,20 +123,19 @@ def create_avg_pca_table():
,results.second as second
FROM results
""")
DB.close()
print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
@ -150,12 +148,11 @@ def create_pca_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)
@ -166,8 +163,8 @@ def create_pca_table():
table_name = f"top.story_embeddings_pca"
DB = connect()
DB.query(f"""
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
@ -175,21 +172,20 @@ def create_pca_table():
,data.second as second
FROM data
""")
DB.close()
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id