Merge branch 'feature_factcheck'

This commit is contained in:
matt 2023-06-01 09:44:28 -07:00
commit 81f4f37c9d
40 changed files with 1354 additions and 1137 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 148 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 235 KiB

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 202 KiB

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 44 KiB

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 87 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 51 KiB

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 128 KiB

After

Width:  |  Height:  |  Size: 128 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 53 KiB

27
src/apriori.py Normal file
View File

@ -0,0 +1,27 @@
import click
from efficient_apriori import apriori

from data.main import connect
@click.command("apriori:rules")
def rules():
    """Mine association rules between publishers that link to each other.

    Builds one transaction per parent publisher TLD (the parent TLD followed
    by the TLDs of every publisher its stories relate to), runs the apriori
    algorithm over those transactions and prints each resulting rule on its
    own line.
    """
    # The context manager closes the connection even when the query raises;
    # the previous explicit close() was skipped on error.
    with connect() as db:
        data = db.query("""
            SELECT
                --list_prepend(parent.id, list(child.id)) as transaction
                list_prepend(parent.tld, list(child.tld)) as transaction
            FROM stories s
            JOIN related_stories r
                ON r.parent_id = s.id
            JOIN publishers parent
                ON parent.id = s.publisher_id
            JOIN publishers child
                ON child.id = r.publisher_id
            GROUP BY
                --parent.id
                parent.tld
        """).df()

    # apriori expects hashable-item collections; convert each list to a tuple.
    transactions = [tuple(items) for items in data["transaction"]]
    itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
    print(*rules, sep="\n")

View File

@ -1,67 +1,42 @@
import click import click
from data.main import connect from data.main import connect, paths
import pandas as pd import pandas as pd
from lxml import etree from lxml import etree
from pathlib import Path from pathlib import Path
import os import os
import csv import csv
def label_to_int(rating:str) -> int:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
def int_to_label(class_id: int) -> str:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
-1 : 'allsides',
}
return mapping[class_id]
@click.command(name="bias:normalize") @click.command(name="bias:normalize")
def normalize() -> None: def normalize() -> None:
DB = connect() with connect() as db:
db.sql("""
DB.sql(""" CREATE OR REPLACE TABLE publisher_bias AS
CREATE OR REPLACE TABLE publisher_bias AS WITH cte AS (
WITH cte AS ( SELECT
SELECT p.id as publisher_id
p.id as publisher_id ,b.id as bias_id
,b.id as bias_id ,b.bias as label
,b.bias as label ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity FROM bias_ratings b
FROM bias_ratings b JOIN top.publishers p
JOIN top.publishers p ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95 ),ranked AS (
),ranked AS ( SELECT
publisher_id
,bias_id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT SELECT
publisher_id publisher_id
,bias_id
,label ,label
,similarity ,bias_id
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn FROM ranked
FROM cte WHERE ranked.rn = 1
) """)
SELECT
publisher_id
,label
,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
mapping = [ mapping = [
{'label' :'left' , 'ordinal': -2}, {'label' :'left' , 'ordinal': -2},
@ -72,22 +47,20 @@ def normalize() -> None:
] ]
mapping = pd.DataFrame(mapping) mapping = pd.DataFrame(mapping)
DB.query("alter table bias_ratings add column ordinal int") with connect() as db:
db.query("alter table bias_ratings add column ordinal int")
DB.query(""" db.query("""
update bias_ratings b update bias_ratings b
set ordinal = o.ordinal set ordinal = o.ordinal
FROM mapping o FROM mapping o
WHERE o.label = b.bias WHERE o.label = b.bias
""") """)
@click.command(name='bias:parse') @click.command(name='bias:parse')
def parse() -> None: def parse() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file""" """parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect() bias_html = paths('data') / 'allsides.html'
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
parser = etree.HTMLParser() parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser) tree = etree.parse(str(bias_html), parser)
@ -111,65 +84,63 @@ def parse() -> None:
rating['disagree'] = int(disagree) rating['disagree'] = int(disagree)
ratings.append(rating) ratings.append(rating)
df = pd.DataFrame(ratings) df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC) df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load") @click.command(name="bias:load")
def load() -> None: def load() -> None:
DB = connect() f = str(paths('data') / "bias_ratings.csv")
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
DB.sql(f""" with connect() as db:
CREATE TABLE bias_ratings as db.sql(f"""
select CREATE TABLE bias_ratings as
row_number() over(order by b.publisher) as id select
,b.* row_number() over(order by b.publisher) as id
from read_csv_auto('{f}') b ,b.*
""") from read_csv_auto('{f}') b
""")
@click.command('bias:export') @click.command('bias:export')
def export(): def export():
data_path = Path(os.environ['DATA_MINING_DATA_DIR']) with connect() as db:
all_bias = db.query("""
SELECT
id as bias_id
,publisher as name
,bias as label
FROM bias_ratings
ORDER by agree desc
""")
DB = connect() all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
all_bias = DB.query(""" with connect() as db:
SELECT mapped_bias = db.query("""
id as bias_id SELECT
,publisher as name p.id as publisher_id
,bias as label ,p.name as name
FROM bias_ratings ,p.tld as tld
ORDER by agree desc ,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""") """)
all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False) mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
mapped_bias = DB.query("""
SELECT
p.id as publisher_id
,p.name as name
,p.tld as tld
,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
DB.close()
@click.command('bias:import-mapped') @click.command('bias:import-mapped')
def import_mapped(): def import_mapped():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
table_name = "top.publisher_bias" table_name = "top.publisher_bias"
DB = connect() df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
print(f"created table: {table_name}") print(f"created table: {table_name}")

View File

@ -1,5 +1,7 @@
import click import click
from dotenv import load_dotenv from dotenv import load_dotenv
import data
import plots
@click.group() @click.group()
def cli(): def cli():
@ -7,12 +9,20 @@ def cli():
if __name__ == "__main__": if __name__ == "__main__":
load_dotenv() load_dotenv()
from data import scrape
cli.add_command(scrape.download) # original bias ratings
cli.add_command(scrape.parse) cli.add_command(data.scrape.download)
cli.add_command(scrape.load) cli.add_command(data.scrape.parse)
cli.add_command(scrape.normalize) cli.add_command(data.scrape.load)
cli.add_command(scrape.create_elections_table) cli.add_command(data.scrape.normalize)
cli.add_command(data.scrape.create_elections_table)
cli.add_command(data.factcheck.parse_index)
cli.add_command(data.factcheck.scrape)
cli.add_command(data.links.create_table)
cli.add_command(data.links.create_pca)
cli.add_command(data.links.create_clusters)
import word import word
# cli.add_command(word.distance) # cli.add_command(word.distance)
@ -23,10 +33,12 @@ if __name__ == "__main__":
cli.add_command(bias.parse) cli.add_command(bias.parse)
cli.add_command(bias.load) cli.add_command(bias.load)
cli.add_command(bias.normalize) cli.add_command(bias.normalize)
import mine import mine
cli.add_command(mine.embeddings) cli.add_command(mine.embeddings)
cli.add_command(mine.cluster) cli.add_command(mine.cluster)
cli.add_command(mine.plot) cli.add_command(mine.plot)
import emotion import emotion
cli.add_command(emotion.extract) cli.add_command(emotion.extract)
cli.add_command(emotion.normalize) cli.add_command(emotion.normalize)
@ -40,34 +52,20 @@ if __name__ == "__main__":
from train import main as train_main from train import main as train_main
cli.add_command(train_main.main) cli.add_command(train_main.main)
import plots.descriptive as plotd cli.add_command(plots.descriptive.articles_per_year)
cli.add_command(plotd.articles_per_year) cli.add_command(plots.descriptive.distinct_publishers)
cli.add_command(plotd.distinct_publishers) cli.add_command(plots.descriptive.stories_per_publisher)
cli.add_command(plotd.stories_per_publisher) cli.add_command(plots.descriptive.top_publishers)
cli.add_command(plotd.top_publishers) cli.add_command(plots.descriptive.common_tld)
cli.add_command(plotd.common_tld)
import links as linkcli
cli.add_command(linkcli.create_table)
cli.add_command(linkcli.create_pca)
cli.add_command(linkcli.create_clusters)
import plots.links as plotl
cli.add_command(plotl.elbow)
cli.add_command(plotl.link_pca_clusters)
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.sentence.sentence_pca) cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca) cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time) cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression) cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time) cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time) cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner) cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli.add_command(plots.links.elbow)
cli.add_command(plots.links.link_pca_clusters)
cli.add_command(plots.classifier.pca_with_classes)
cli() cli()

View File

@ -1,6 +1,10 @@
import data.main import data.main
import data.scrape import data.scrape
import data.factcheck
import data.links
__all__ = [ __all__ = [
'main' 'main'
,'scrape' ,'scrape'
,'factcheck'
,'links'
] ]

171
src/data/factcheck.py Normal file
View File

@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from random import randint
from time import sleep
from tqdm import tqdm
@click.command('mbfc:parse-index')
def parse_index():
    """Scrape the mediabiasfactcheck.com search index into ``mbfc_bias.csv``.

    Fetches listing pages 1 through 53, turns every row of the ``mbfc-table``
    table into one record and writes all records, pipe-separated, to the
    data directory.

    Raises:
        requests.RequestException: when a listing page cannot be fetched or
            answers with an HTTP error status. Previously an error page was
            parsed as if it were a listing and silently contributed no rows.
    """
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        # A timeout keeps a stalled connection from hanging the command.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        tree = etree.parse(BytesIO(response.content), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            # Each row is expected to carry exactly these eight cells, in order.
            (link, bias, reporting, country, credibility,
             media_type, traffic, popularity) = row.iterchildren()
            link = link.xpath('./a')[0]
            publishers.append({
                'name': link.text,
                'detail_url': link.get('href'),
                'bias': bias.text,
                'reporting': reporting.text,
                'country': country.text,
                'credibility': credibility.text,
                'media_type': media_type.text,
                'traffic': traffic.text,
                'popularity': popularity.xpath('./span')[0].text,
            })

    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
    """Create the ``mbfc`` schema and (re)create the ``mbfc.scrape`` log table.

    ``mbfc.scrape`` records each downloaded URL together with the time it was
    scraped. NOTE: the table is replaced, so re-running this command clears
    the scrape log.
    """
    with connect() as db:
        # IF NOT EXISTS makes the command re-runnable, matching the
        # CREATE OR REPLACE used for the table below.
        db.sql("""create schema if not exists mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
        )
        """)
@click.command("mbfc:scrape")
def scrape():
    """Download every not-yet-scraped publisher detail page.

    Reads the detail URLs from ``mbfc_bias.csv``, skips those already listed
    in ``mbfc.scrape``, and saves each remaining page as
    ``<data>/mbfc/<last-url-segment>.html``. A URL is logged in
    ``mbfc.scrape`` only after its page was fetched successfully, so failed
    downloads are retried on the next run.
    """
    # NOTE: the SQL below refers to this DataFrame by its Python name ``df``.
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")

    with connect() as db:
        stats = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
                on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    # Make sure the target directory exists before the first write.
    out_dir = paths('data') / 'mbfc'
    out_dir.mkdir(parents=True, exist_ok=True)

    for url in df.url:
        delay = randint(1, 3)
        save_as = out_dir / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        # Random pause between requests to go easy on the remote server.
        sleep(delay)
        try:
            response = requests.get(url, timeout=30)
            # Treat HTTP errors as failures so an error page is never stored
            # and logged as a successful scrape.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"request failed: {url}: {e}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)
def load():
    """Extract each publisher's own URL from the scraped detail pages.

    For every file in ``<data>/mbfc`` the first text node containing
    "source:" (case-insensitive) whose enclosing ``<p>`` holds a link
    supplies the publisher URL. Results are written pipe-separated to
    ``mbfc_publisher_url.csv`` with columns ``origin_url`` and, when a link
    was found, ``tld``.
    """
    # NOTE(review): create_tables() merges this CSV on a column named
    # 'mbfc_url', which is not written here -- confirm the intended name.
    source_marker = re.compile(r'source:', re.IGNORECASE)
    publishers = []
    for page in tqdm((paths('data') / 'mbfc').iterdir()):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')

        for e in tree(string=source_marker):
            # find_parent returns None when no <p> ancestor exists; the
            # manual parent walk used before crashed in that case.
            paragraph = e.find_parent('p')
            if paragraph is None:
                continue
            link = paragraph.find('a')
            if link is not None:
                publisher['tld'] = link.get('href')
                break
        else:
            # Replaces a leftover breakpoint(): report and keep going.
            print(f"no source link found: {page}", file=sys.stderr)
        publishers.append(publisher)

    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
@click.command('mbfc:create-tables')
def create_tables():
    """Build ``mbfc.publishers`` and ``mbfc.publisher_stories``.

    Publishers are de-duplicated by registered domain, taking the most
    frequent value of every attribute per domain. Stories are then matched
    to publishers through the registered domain of the story URL.
    """
    # Imported here because the module-level import of data.main does not
    # include this helper.
    from data.main import bias_label_to_int

    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    # NOTE(review): both CSVs must carry an 'mbfc_url' column for this merge;
    # confirm against the commands that produce them.
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    # The SQL below reads the DataFrame through its Python name ``df``.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    # Fix: the tld column was previously assigned to an undefined ``stories``
    # frame, while the join below reads ``raw_stories``.
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
                s.id as story_id
                ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)

135
src/data/links.py Normal file
View File

@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
    """Rebuild the ``link_edges`` table of publisher-to-publisher links.

    Every edge carries the raw link count, the count normalized by the
    parent's total, and a 0/1 indicator. Only edges whose parent also occurs
    as a child, and whose child also occurs as a parent, are kept.
    """
    build_edges = """
        CREATE OR REPLACE TABLE link_edges AS
        with cte as(
            SELECT
                s.publisher_id as parent_id
                ,r.publisher_id as child_id
                ,count(1) as links
            FROM stories s
            JOIN related_stories r
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            cte.parent_id
            ,cte.child_id
            ,cte.links as links
            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
            ,case when cte.links > 0 then 1 else 0 end as onehot
        FROM cte
        WHERE cte.child_id in (
            SELECT
                distinct parent_id
            FROM cte
        )
        AND cte.parent_id in (
            SELECT
                distinct child_id
            FROM cte
        )
    """
    # Sample row plus total row count; the result is not used.
    preview = """
        SELECT
            *
            ,count(1) over()
        FROM link_edges e
        limit 1
    """
    with connect() as db:
        db.query(build_edges)
        db.query(preview)
    print("created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels

    Pivots ``link_edges`` into a parent x child matrix of the chosen
    ``source`` column, projects it onto two principal components and stores
    the coordinates per publisher in ``publisher_pca_<source>``.
    """
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        # DISTINCT: the join to publisher_stories only restricts to publishers
        # that have stories. Without it every publisher came back once per
        # story, duplicating rows in the output table after the merge below.
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON p.id = ps.publisher_id
        """).df()
        # ``source`` is constrained by click.Choice, so interpolating it as a
        # column name is safe.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    # Missing parent/child pairs mean "no links".
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    svd = PCA(n_components=2)
    svd_out = svd.fit_transform(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['first'] = svd_out[:, 0]
    out['second'] = svd_out[:, 1]
    # NOTE(review): parent_id originates from stories.publisher_id while
    # pub.id is an mbfc.publishers id -- confirm both use the same id space.
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    # The SQL below reads the DataFrame through its Python name ``out``.
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)
    print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """create link adj. matrix clusters table

    Pivots ``link_edges`` into a parent x child matrix of the chosen
    ``source`` column, clusters the rows with k-means (k=8) and stores the
    cluster label per publisher in ``publisher_clusters_<source>``.
    """
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"

    with connect() as db:
        # ``source`` is constrained by click.Choice, so interpolating it as a
        # column name is safe.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        # DISTINCT: the join to publisher_stories only restricts to publishers
        # that have stories. Without it every publisher came back once per
        # story, duplicating rows in the output table after the merge below.
        pub = db.query("""
            SELECT DISTINCT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
        """).df()

    # Missing parent/child pairs mean "no links".
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    # NOTE(review): parent_id originates from stories.publisher_id while
    # pub.id is an mbfc.publishers id -- confirm both use the same id space.
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    # The SQL below reads this DataFrame through its Python name ``new_table``.
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)
    print(f"created {table_name}")

View File

@ -2,6 +2,10 @@ import os
from pathlib import Path from pathlib import Path
import duckdb import duckdb
from enum import Enum from enum import Enum
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
import sys
class Data(str, Enum): class Data(str, Enum):
Titles = 'titles' Titles = 'titles'
@ -9,6 +13,16 @@ class Data(str, Enum):
def data_dir(): def data_dir():
return Path(os.environ['DATA_MINING_DATA_DIR']) return Path(os.environ['DATA_MINING_DATA_DIR'])
def paths(name='app'):
    """Return the project directory selected by ``name``.

    Matching is by substring and checked in this order: ``app``, ``data``,
    ``doc``, ``figure`` (so ``'docs'`` and ``'figures'`` work as well).

    Args:
        name: which directory to resolve.

    Returns:
        A ``Path`` built from the matching ``DATA_MINING_*`` environment
        variable; figures live in the ``figures`` folder of the docs dir.

    Raises:
        KeyError: if the required environment variable is not set.
        ValueError: if ``name`` matches none of the known directories
            (previously this silently returned ``None``).
    """
    # (substring, environment variable, optional sub-directory)
    known = (
        ('app', 'DATA_MINING_APP_DIR', None),
        ('data', 'DATA_MINING_DATA_DIR', None),
        ('doc', 'DATA_MINING_DOCS_DIR', None),
        ('figure', 'DATA_MINING_DOCS_DIR', 'figures'),
    )
    for key, variable, subdir in known:
        if key in name:
            base = Path(os.environ[variable])
            return base / subdir if subdir else base
    raise ValueError(f"unknown path name: {name!r}")
def connect(): def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR']) DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR']) # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@ -28,3 +42,66 @@ def from_db(t: Data):
limit 100 limit 100
""").df() """).df()
return table return table
def map_tld(x):
    """Return the registered domain (``fld``) of URL ``x``.

    Returns ``None`` and reports on stderr when ``x`` cannot be parsed.
    """
    try:
        res = get_tld(x, as_object=True)
        return res.fld
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and made long ``apply`` runs impossible to interrupt.
        print(f"'{x}' is not valid.", file=sys.stderr)
        return None
def ticklabels():
    """Return the five bias labels, ordered left to right, as a new list."""
    ordered = ('Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right')
    return list(ordered)
def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
    """Map a bias label to its ordinal class id (0 = left ... 4 = right).

    Args:
        rating: the label to convert.
        source: ``'mbfc'`` selects the capitalized label set
            ('Left' ... 'Least Biased' ... 'Right'); any other value selects
            the lower-case set ('left' ... 'center' ... 'right').

    Returns:
        The class id, or ``-1`` (after reporting on stderr) when the label
        is unknown.
    """
    if source == 'mbfc':
        labels = ('Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right')
    else:
        labels = ('left', 'left-center', 'center', 'right-center', 'right')
    mapping = {label: class_id for class_id, label in enumerate(labels)}
    try:
        return mapping[rating]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable input. Narrowed from
        # a bare ``except:`` so interrupts are no longer swallowed.
        print(f"no mapping for {rating}", file=sys.stderr)
        return -1
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    """Map an ordinal class id (0 = left ... 4 = right) to its bias label.

    Args:
        class_id: the class id to convert.
        source: ``'mbfc'`` selects the capitalized label set; any other
            value selects the lower-case set.

    Returns:
        The label. NOTE: for an unknown ``class_id`` the integer ``-1`` is
        returned (after reporting on stderr) despite the ``str`` annotation;
        this sentinel is kept for existing callers.
    """
    if source == 'mbfc':
        labels = ('Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right')
    else:
        labels = ('left', 'left-center', 'center', 'right-center', 'right')
    mapping = dict(enumerate(labels))
    try:
        return mapping[class_id]
    except (KeyError, TypeError):
        # KeyError: unknown id; TypeError: unhashable input. Narrowed from a
        # bare ``except:`` so interrupts are no longer swallowed.
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1

View File

@ -319,12 +319,6 @@ def another_norm():
""") """)
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
return None
DB.sql(""" DB.sql("""
SELECT SELECT

47
src/data/selection.py Normal file
View File

@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np
def create_tables():
    """Create the ``top`` schema holding the link-connected subset of data.

    ``top.publishers`` keeps the publishers that appear as a parent in
    ``link_edges``; ``top.stories`` keeps their stories published from 2006
    through 2022; ``top.related_stories`` keeps the related-story rows whose
    parent is one of those stories.
    """
    with connect() as db:
        edges = db.query("""
            select
                *
            from link_edges
        """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    # The SQL below reads this DataFrame through its Python name.
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        # IF NOT EXISTS makes the function re-runnable, matching the
        # CREATE OR REPLACE used for every table below.
        db.query("create schema if not exists top")

        db.query("""
            CREATE OR REPLACE TABLE top.publishers AS
            SELECT
                p.*
            FROM publishers p
            JOIN select_publishers s
                ON s.publisher_id = p.id
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.stories AS
            SELECT
                s.*
            FROM stories s
            JOIN top.publishers p
                ON s.publisher_id = p.id
            WHERE year(s.published_at) >= 2006
            AND year(s.published_at) < 2023
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.related_stories AS
            SELECT
                r.*
            FROM top.stories s
            JOIN related_stories r
                ON s.id = r.parent_id
        """)

View File

@ -1,10 +1,11 @@
import click
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from data import connect, data_dir from data.main import connect, paths
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
import click import pandas as pd
@click.option('-c', '--chunks', type=int, default=500, show_default=True) @click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract") @click.command("sentiment:extract")
@ -67,20 +68,19 @@ def extract(chunks):
@click.command('sentiment:load') @click.command('sentiment:load')
def load(): def load():
DB = connect() sentiments = np.load(paths('data') / 'sentiment.npy')
sentiments = np.load(data_dir() / 'sentiment.npy') story_ids = np.load(paths('data') / 'sentiment_ids.npy')
story_ids = np.load(data_dir() / 'sentiment_ids.npy')
data = pd.DataFrame(story_ids, columns=['story_id']).reset_index() data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
data['sentiment_id'] = sentiments data['sentiment_id'] = sentiments
DB.query(""" with connect() as db:
CREATE OR REPLACE TABLE top.story_sentiments AS db.query("""
SELECT CREATE OR REPLACE TABLE story_sentiments AS
data.story_id SELECT
,data.sentiment_id as class_id data.story_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label ,data.sentiment_id as class_id
FROM data ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
JOIN top.stories s FROM data
ON s.id = data.story_id JOIN stories s
""") ON s.id = data.story_id
DB.close() """)

View File

@ -1,255 +0,0 @@
import click
from data.main import connect
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Removed by this commit (superseded by data/links.py).
# Rebuilds top.link_edges: one row per (parent publisher, child publisher)
# with the raw link count, the count normalized by the parent's total, and a
# 0/1 indicator. Only edges whose child also occurs as a parent and whose
# parent also occurs as a child are kept.
@click.command('links:create-table')
def create_table():
table_name = "top.link_edges"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM top.stories s
JOIN top.related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
DB.close()
# Ad-hoc inspection query for one hard-coded publisher (id 238); its result
# is not used.
DB = connect()
DB.query("""
SELECT
*
,-log10(links)
--distinct parent_id
FROM top.link_edges e
WHERE e.parent_id = 238
""")
DB.close()
print(f"created {table_name}")
# Removed by this commit (superseded by data/links.py).
# Pivots top.link_edges into a parent x child matrix of the chosen column,
# reduces it to two principal components and stores them per publisher in
# top.publisher_pca_<source>.
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"top.publisher_pca_{source}"
DB = connect()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
# `source` is restricted by click.Choice above and used as a column name.
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
DB.close()
# Missing parent/child pairs are treated as zero links.
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
# The SQL below reads the DataFrame through its Python variable name `out`.
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
DB.close()
print(f"created {table_name}")
# Removed by this commit (superseded by data/links.py).
# Pivots top.link_edges into a parent x child matrix of the chosen column,
# clusters its rows with k-means (k = 8) and stores the label per publisher
# in top.publisher_clusters_<source>.
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
from sklearn.cluster import KMeans
table_name = f"top.publisher_clusters_{source}"
DB = connect()
# `source` is restricted by click.Choice above and used as a column name.
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
# Missing parent/child pairs are treated as zero links.
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
# The SQL below reads this DataFrame through its Python variable name.
new_table = out[['id', 'label']]
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
DB.close()
print(f"created {table_name}")
# Removed by this commit.
def to_matrix():
"""Build publisher link edges and return a frame of the parent publishers.

NOTE(review): despite the name, the value returned is `out` (publisher id,
name and url for each parent id), not the adjacency matrix `adj`.
"""
# NOTE(review): this connection is never closed in this function.
DB = connect()
# The SQL below reads this DataFrame through its Python variable name.
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
# `bias` is computed but not used afterwards.
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
from publishers p
""").df()
# Link counts per (parent, child), restricted to parents with >= 20 stories.
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
# keeps only edges with more than 2 links; `test` is not used afterwards
test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
# Result of the next expression is discarded.
edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
# NOTE(review): `adj` is referenced here before it is assigned on the next
# line, which raises at runtime; the merge result is discarded anyway.
pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
adj.values.shape
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
return out
# Removed by this commit.
# Exploratory command: projects the result of to_matrix() with PCA and
# truncated SVD, clusters the PCA output with mini-batch k-means and shows
# scatter plots interactively.
@click.command('links:analysis')
def analysis():
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
adj = to_matrix()
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
# The SVD coordinates are immediately overwritten by the PCA coordinates.
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
# NOTE(review): `pub` is not defined in this function or in the visible
# module scope -- this line looks like it raises NameError; confirm.
sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
plt.show()

View File

@ -1,6 +1,5 @@
from data.main import data_dir, connect from data.main import connect, paths
import numpy as np import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans from sklearn.cluster import MiniBatchKMeans
import click import click
from pathlib import Path from pathlib import Path
@ -11,7 +10,7 @@ from enum import Enum, auto
@click.command(name="mine:embeddings") @click.command(name="mine:embeddings")
def embeddings(): def embeddings():
data = np.load(data_dir() / "embeddings.npy") data = np.load(paths('data') / "embeddings.npy")
kmeans = MiniBatchKMeans(n_clusters=5, kmeans = MiniBatchKMeans(n_clusters=5,
random_state=0, random_state=0,
batch_size=6, batch_size=6,
@ -76,7 +75,7 @@ class PlotName(str, Enum):
@click.option('-n', '--name', required=True, type=click.Choice(PlotName)) @click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path()) @click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: Path): def plot(name: PlotName, output: Path):
output = output if output else APP_DIR / f'docs/{name}.png' output = output if output else paths('figures') / f'{name}.png'
if name == PlotName.TitleLength: if name == PlotName.TitleLength:
fig, ax = plt.subplots(1,1) fig, ax = plt.subplots(1,1)
data = db.sql(""" data = db.sql("""

36
src/mining/bias.py Normal file
View File

@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path
def normalize():
    """Run two summary queries over the MBFC publisher/story tables.

    The first counts stories per publisher name; the second counts
    publishers and stories per bias label. Neither result is stored or
    returned.
    """
    stories_per_publisher = """
        SELECT
            p.name
            ,count(1) as ctn
            ,sum(ctn) over() as all
        FROM mbfc.publishers p
        JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
        JOIN stories s
            ON s.id = ps.story_id
        GROUP BY
            p.name
    """
    stories_per_bias = """
        SELECT
            bias
            ,count(distinct p.id) as publishers
            ,count(1) as stories
            ,count(1) / count(distinct p.id) as ratio
        FROM mbfc.publishers p
        JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = p.id
        JOIN stories s
            ON s.id = ps.story_id
        GROUP BY
            p.bias
        ORDER BY count(1)
    """
    # One connection per query, as before.
    for statement in (stories_per_publisher, stories_per_bias):
        with connect() as db:
            db.sql(statement)

View File

@ -1,9 +1,13 @@
import plots.sentence import plots.sentence
import plots.emotion import plots.emotion
import plots.sentiment import plots.sentiment
import plots.links
import plots.classifier
__all__ = [ __all__ = [
'sentence' 'sentence'
'emotion', 'emotion',
'sentiment', 'sentiment',
'links',
'classifier',
] ]

View File

@ -1,5 +1,5 @@
import click import click
from data.main import connect from data.main import connect, bias_label_to_int, ticklabels
import os import os
from pathlib import Path from pathlib import Path
import seaborn as sns import seaborn as sns
@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist') @click.command('plot:bias-hist')
def hist(): def hist():
filename = "bias_hist.png" save_to = paths('figures') / "bias_hist.png"
with connect() as db:
data = db.sql("""
SELECT
p.ordinal
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
p.ordinal
""").df()
DB = connect()
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue') ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-publisher-hist') @click.command('plot:bias-publisher-hist')
def publisher_hist(): def publisher_hist():
filename = "bias_publisher_hist.png" save_to = paths('figures') / "bias_publisher_hist.png"
DB = connect() with connect() as db:
data = DB.sql(""" data = db.sql("""
SELECT SELECT
b.ordinal p.ordinal
,count(1) as publishers ,count(distinct p.id) as publishers
FROM publisher_bias pb FROM mbfc.publishers p
JOIN bias_ratings b JOIN mbfc.publisher_stories ps
ON b.id = pb.bias_id ON ps.publisher_id = p.id
GROUP BY WHERE ordinal != -1
b.ordinal GROUP BY
""").df() p.ordinal
DB.close() """).df()
ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue') ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {filename}") print(f"saved: {save_to}")

View File

@ -5,30 +5,32 @@ import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from pathlib import Path from pathlib import Path
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:pca-with-classes') @click.command('plot:pca-with-classes')
def pca_with_classes(): @click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
filename = "pca_with_classes.png" def pca_with_classes(source):
DB = connect() save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
data = DB.query(f"""
SELECT with connect() as db:
p.tld df = db.query(f"""
,b.bias SELECT
,c.first p.tld
,c.second ,p.bias
,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio ,c.first
FROM top.publishers p ,c.second
JOIN top.publisher_bias pb --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
ON p.id = pb.publisher_id FROM mbfc.publishers p
JOIN bias_ratings b JOIN publisher_pca_{source} c
ON b.id = pb.bias_id ON c.publisher_id = p.id
JOIN top.publisher_pca_normalized c WHERE p.ordinal != -1
ON c.publisher_id = p.id ORDER BY p.ordinal
""").df() """).df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100) ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component") ax.set(xlabel="first pca component",
plt.savefig(out_dir / filename) ylabel="second pca component")
print(f"saved: {filename}") ax.figure.suptitle="pca components vs. bias labels"
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')

View File

@ -1,169 +1,190 @@
import click import click
from data.main import connect from data.main import connect, paths
import os import os
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:articles-per-year') @click.command('plot:articles-per-year')
def articles_per_year(): def articles_per_year():
filename = 'articles_per_year.png' save_to = paths('figures') / 'articles_per_year.png'
DB = connect() with connect() as db:
data = DB.query(""" data = DB.query("""
select select
year(published_at) as year year(published_at) as year
,count(1) as stories ,count(1) as stories
from stories from stories
group by group by
year(published_at) year(published_at)
""").df() """).df()
DB.close()
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue') ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
ax.tick_params(axis='x', rotation=90) ax.tick_params(axis='x', rotation=90)
ax.set(title="count of articles per year", ylabel="count of stories (#)") ax.set(title="count of articles per year", ylabel="count of stories (#)")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_dir / filename) plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('plot:distinct-publishers') @click.command('plot:distinct-publishers')
def distinct_publishers(): def distinct_publishers():
filename = 'distinct_publishers.png' save_to = paths('figures') / 'distinct_publishers.png'
DB = connect() with connect() as db:
data = DB.query(""" data = DB.query("""
select select
year(published_at) as year year(published_at) as year
,count(distinct publisher_id) as publishers ,count(distinct publisher_id) as publishers
from stories from stories
group by group by
year(published_at) year(published_at)
""").df() """).df()
DB.close()
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue') ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=90) ax.tick_params(axis='x', rotation=90)
ax.set(title="count of publishers per year", ylabel="count of publishers (#)") ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher') @click.command('plot:stories-per-publisher')
def stories_per_publisher(): def stories_per_publisher():
filename = 'stories_per_publisher.png' save_to = paths('figures') / 'stories_per_publisher.png'
DB = connect() with connect() as db:
data = DB.query(""" data = db.query("""
with cte as ( with cte as (
select
publisher_id
,year(published_at) as year
,count(1) as stories
from stories
group by
publisher_id
,year(published_at)
) , agg as (
select select
publisher_id ps.publisher_id
,avg(stories) as stories_per_year ,year(s.published_at) as year
,case ,count(1) as stories
when avg(stories) < 2 then 2 from stories s
when avg(stories) < 4 then 4 join mbfc.publisher_stories ps
when avg(stories) < 8 then 8 on ps.story_id = s.id
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
group by group by
publisher_id ps.publisher_id
) ,year(s.published_at)
select ) , agg as (
max_avg select
,count(1) as publishers publisher_id
from agg ,avg(stories) as stories_per_year
group by ,case
max_avg when avg(stories) < 2 then 2
""").df() when avg(stories) < 4 then 4
DB.close() when avg(stories) < 8 then 8
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
group by
publisher_id
)
select
max_avg
,count(1) as publishers
from agg
group by
max_avg
""").df()
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue') ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year") ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
plt.tight_layout() plt.tight_layout()
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {save_to}")
@click.command('plot:top-publishers') @click.command('plot:top-publishers')
def top_publishers(): def top_publishers():
"""plot top publishers over time""" """plot top publishers over time"""
filename = 'top_publishers.png' save_to = paths('figures') / 'top_publishers.png'
DB = connect() with connect() as db:
data = DB.query(""" db.query("""
select SELECT
p.tld p.tld
,year(published_at) as year ,p.id
,count(1) as stories FROM mbfc.publishers p
from ( JOIN mbfc.publisher_stories ps
select ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
""")
with connect() as db:
data = db.query("""
WITH p as (
SELECT
p.tld
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
)
SELECT
p.tld p.tld
,p.id ,YEAR(s.published_at) AS year
from top.publishers p ,COUNT(1) AS stories
join top.stories s FROM stories s
on s.publisher_id = p.id JOIN mbfc.publisher_stories ps
group by ON ps.story_id = s.id
JOIN p
ON p.id = ps.publisher_id
GROUP by
p.tld p.tld
,p.id ,YEAR(published_at)
order by count(1) desc ORDER BY year, COUNT(DISTINCT s.id) DESC
limit 20 """).df()
) p
join top.stories s
on s.publisher_id = p.id
group by
p.tld
,year(published_at)
order by count(distinct s.id) desc
""").df()
DB.close()
pivot = data.pivot(columns='year', index='tld', values='stories') pivot = data.pivot(columns='year', index='tld', values='stories')
ax = sns.heatmap(pivot, cmap="crest") ax = sns.heatmap(pivot, cmap="crest")
ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)") ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {save_to}")
@click.command('plot:common_tld') @click.command('plot:common_tld')
def common_tld(): def common_tld():
import dataframe_image as dfi import dataframe_image as dfi
filename = 'common_tld.png' save_to = paths('figures') / 'common_tld.png'
DB = connect() with connect() as db:
data = DB.query(""" data = db.query("""
select select
split_part(url, '.', -1) as tld split_part(url, '.', -1) as tld
,count(1) as publishers ,count(1) as publishers
,case when count(1) < 20 ,case when count(1) < 20
then string_agg(distinct url, '\t') then string_agg(distinct url, '\t')
else NULL else NULL
end as urls end as urls
from publishers from publishers
group by group by
split_part(url, '.', -1) split_part(url, '.', -1)
order by order by
count(1) desc count(1) desc
""").df() """).df()
DB.close() data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
def stats(): def stats():
@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats') @click.command('plot:bias-stats')
def bias_stats(): def bias_stats():
import dataframe_image as dfi import dataframe_image as dfi
filename = 'bias_stats.png' save_to = paths('figures') / 'bias_stats.png'
DB = connect() DB = connect()
@ -300,3 +321,69 @@ def bias_stats():
""").df() """).df()
DB.close() DB.close()
print(df.to_markdown(index=False)) print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
def bias_over_time():
"""plot bias labels over time"""
save_to = paths('figures') / 'bias_over_time.png'
with connect() as db:
df = db.sql("""
SELECT
p.bias
,p.id
,date_trunc('year', s.published_at) as year
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
where year(s.published_at) not in (2006, 2023)
and p.ordinal != -1
GROUP BY
p.bias
,p.id
,p.ordinal
,date_trunc('year', s.published_at)
order by
p.ordinal
,date_trunc('year', s.published_at)
""").df()
ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
def bias_missing():
with connect() as db:
df = db.sql("""
SELECT
date_trunc('year', s.published_at) as year
,s.tld
,count(1) as stories
FROM stories s
LEFT JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
WHERE ps.publisher_id is NULL
AND year(s.published_at) not in (2006, 2023)
GROUP BY
s.tld
,date_trunc('year', s.published_at)
HAVING count(1) > 10
ORDER BY
date_trunc('year', s.published_at)
""").df()
ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.show()
#plt.savefig(save_to)
plt.close()
#print(f"saved: {save_to}")

View File

@ -1,77 +1,79 @@
import click import click
from data.main import connect from data.main import connect, paths, ticklabels
import os import os
from pathlib import Path
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time') @click.command('plot:emotion-over-time')
def emotion_over_time(): def emotion_over_time():
filename = "emotion_over_time.png"
DB = connect()
emotions = DB.sql(""" filename = "emotion_over_time.png"
SELECT save_to = paths('figures') / filename
date_trunc('year', s.published_at) AS year
,e.label AS emotion with connect() as db:
,count(1) AS stories emotions = db.sql("""
FROM top.stories s SELECT
JOIN story_emotions se date_trunc('year', s.published_at) AS year
ON s.id = se.story_id ,e.label AS emotion
JOIN emotions e ,count(1) AS stories
ON e.id = se.emotion_id FROM stories s
GROUP by JOIN story_emotions se
date_trunc('year', s.published_at) ON s.id = se.story_id
,e.label JOIN emotions e
""").df() ON e.id = se.emotion_id
DB.close() GROUP by
date_trunc('year', s.published_at)
,e.label
""").df()
ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion']) ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)") ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression') @click.command('plot:emotion-regression')
def emotion_regression(): def emotion_regression():
"""plot emotion over time as regression"""
from sklearn import linear_model from sklearn import linear_model
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay from sklearn.metrics import ConfusionMatrixDisplay
filename = "emotion_regression.png" filename = "emotion_regression.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
emotions = DB.query(""" #emotions = db.query("""
SELECT # SELECT
label # label
FROM emotions e # FROM emotions e
""").df()['label'].to_list() #""").df()['label'].to_list()
DB.close() df = db.sql(f"""
SELECT
DB = connect() epoch(date_trunc('yearweek', s.published_at)) AS date
df = DB.sql(f""" ,e.id AS emotion_id
SELECT ,p.id as publisher_id
epoch(date_trunc('yearweek', s.published_at)) AS date ,count(1) AS stories
,e.id AS emotion_id FROM stories s
,p.id as publisher_id JOIN mbfc.publisher_stories ps
,count(1) AS stories ON ps.story_id = s.id
FROM top.stories s JOIN mbfc.publishers p
JOIN top.publishers p ON p.id = ps.publisher_id
ON p.id = s.publisher_id JOIN story_emotions se
JOIN story_emotions se ON s.id = se.story_id
ON s.id = se.story_id JOIN emotions e
JOIN emotions e ON e.id = se.emotion_id
ON e.id = se.emotion_id WHERE p.ordinal != -1
GROUP by GROUP by
epoch(date_trunc('yearweek', s.published_at)) epoch(date_trunc('yearweek', s.published_at))
,p.id ,p.id
,e.id ,e.id
""").df() """).df()
DB.close()
results = [] results = []
for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']): for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@ -83,77 +85,59 @@ def emotion_regression():
results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year}) results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
results = pd.DataFrame(results) results = pd.DataFrame(results)
DB = connect() with connect() as db:
out = DB.query(""" out = db.query("""
SELECT SELECT
e.label as emotion e.label as emotion
--,p.tld ,avg(results.per_year) as avg_reg_coef
,avg(results.per_year) as avg_reg_coef ,p.bias
,b.ordinal FROM results
FROM results JOIN emotions e
JOIN emotions e ON e.id = results.emotion_id
ON e.id = results.emotion_id JOIN mbfc.publishers p
JOIN top.publishers p ON p.id = results.publisher_id
ON p.id = results.publisher_id GROUP BY
JOIN publisher_bias pb e.label
ON pb.publisher_id = results.publisher_id ,p.bias
JOIN bias_ratings b """).df()
ON b.id = pb.bias_id
GROUP BY
e.label
,b.ordinal
""").df()
DB.close()
pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
ax = sns.heatmap(pivot, cmap='RdBu_r') pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
#ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
ax.set(title="slope of regression (stories/year) by bias and emotion" ax.set(title="slope of regression (stories/year) by bias and emotion"
,xticklabels=ticklabels ,xticklabels=ticklabels()
,xlabel="bias" ,xlabel="bias"
,ylabel="emotion") ,ylabel="emotion")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")
@click.command('plot:emotion-hist') @click.command('plot:emotion-hist')
def emotion_hist(): def emotion_hist():
filename = "emotion_hist.png" filename = "emotion_hist.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
DB.query("""describe story_emotions""") data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query(""" ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
select ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
e.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from story_emotions se
join emotions e
on e.id = se.emotion_id
join top.stories s
on s.id = se.story_id
group by
e.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")

View File

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score from sklearn.metrics import silhouette_score
import pandas as pd import pandas as pd
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow') @click.command('plot:link-elbow')
def elbow(): def elbow():
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
filename = 'link_cluster_elbow.png' save_to = paths('figures') / 'link_cluster_elbow.png'
with connect() as db:
df = db.query("""
SELECT
*
FROM link_edges
""").df()
DB = connect()
df = DB.query("""
SELECT
*
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0) pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
to_plot = [] to_plot = []
@ -36,8 +36,9 @@ def elbow():
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia) ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points") ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved plot: {save_to}")
# randomly pick 8 # randomly pick 8
@ -45,72 +46,65 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links') @click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source): def link_pca_clusters(source):
filename = f"link_pca_clusters_{source}.png" save_to = paths('figures') / f"link_pca_clusters_{source}.png"
DB = connect() with connect() as db:
df = DB.query(f""" df = db.query(f"""
SELECT SELECT
c.label as cluster c.label as cluster
,p.tld ,p.tld
--,b.label as bias --,b.label as bias
,pca.first ,pca.first
,pca.second ,pca.second
,s.cnt as stories ,s.cnt as stories
FROM top.publisher_clusters_{source} c FROM top.publisher_clusters_{source} c
JOIN top.publishers p JOIN top.publishers p
ON c.publisher_id = p.id ON c.publisher_id = p.id
JOIN JOIN
( (
select select
s.publisher_id s.publisher_id
,count(1) as cnt ,count(1) as cnt
FROM top.stories s FROM top.stories s
GROUP BY GROUP BY
s.publisher_id s.publisher_id
) s ) s
ON s.publisher_id = p.id ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id ON pca.publisher_id = p.id
""").df() """).df()
DB.close()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster']) ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component") ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename) plt.savefig(save_to)
print(f"saved plot: {save_to}")
# .df().groupby(['cluster', 'bias']).describe()
def test(): def test():
data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
DB.query(""" with connect() as db:
SELECT db.query("""
p.id as publisher_id SELECT
,p.name p.id as publisher_id
,p.tld ,p.name
,cast(b.bias_id as int) as bias_id ,p.tld
,count(1) as stories ,cast(b.bias_id as int) as bias_id
FROM publishers p ,count(1) as stories
JOIN stories s FROM publishers p
ON s.publisher_id = p.id JOIN stories s
JOIN publisher_clusters c ON s.publisher_id = p.id
ON c.publisher_id = p.id JOIN publisher_clusters c
LEFT JOIN publisher_bias b ON c.publisher_id = p.id
ON b.publisher_id = p.id LEFT JOIN publisher_bias b
where bias_id is null ON b.publisher_id = p.id
group by where bias_id is null
p.id group by
,p.name p.id
,p.tld ,p.name
,b.bias_id ,p.tld
ORDER BY count(1) desc ,b.bias_id
""") ORDER BY count(1) desc
""")
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
DB.close()
@click.command('plot:link-confusion') @click.command('plot:link-confusion')
@ -120,34 +114,36 @@ def link_confusion():
from sklearn.metrics import ConfusionMatrixDisplay from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png" filename = "link_confusion.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
bias = DB.query(""" bias = db.query("""
SELECT SELECT
p.id as publisher_id p.id as publisher_id
,b.ordinal ,b.ordinal
FROM top.publishers p FROM top.publishers p
JOIN top.publisher_bias pb JOIN top.publisher_bias pb
ON pb.publisher_id = p.id ON pb.publisher_id = p.id
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
""").df() """).df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0) pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values x = pivot.values
@ -166,9 +162,9 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax) ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels) ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved plot: {filename}") print(f"saved plot: {save_to}")
@click.command('plot:link-classifier') @click.command('plot:link-classifier')
def link_confusion(): def link_confusion():
@ -176,49 +172,51 @@ def link_confusion():
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png" save_to = paths('figures') / "link_confusion.png"
DB = connect() with connect() as db:
bias = DB.query(""" bias = db.query("""
SELECT SELECT
p.id as publisher_id p.id as publisher_id
,b.ordinal ,b.ordinal
FROM top.publishers p FROM top.publishers p
JOIN top.publisher_bias pb JOIN top.publisher_bias pb
ON pb.publisher_id = p.id ON pb.publisher_id = p.id
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
""").df() """).df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0) pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values x = pivot.values
y = bias.sort_values('publisher_id').ordinal y = bias.sort_values('publisher_id').ordinal
data = DB.query(f""" with connect() as db:
SELECT data = db.query(f"""
p.id as publisher_id SELECT
,pca.first p.id as publisher_id
,pca.second ,pca.first
FROM top.publisher_pca_onehot pca ,pca.second
JOIN top.publishers p FROM top.publisher_pca_onehot pca
ON pca.publisher_id = p.id JOIN top.publishers p
""").df() ON pca.publisher_id = p.id
""").df()
@ -235,11 +233,11 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax) ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels) ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved plot: {filename}") print(f"saved plot: {save_to}")
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred']) # ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
plt.savefig(out_dir / filename) # plt.savefig(out_dir / filename)
plt.close() # plt.close()
print(f"saved plot: {filename}") # print(f"saved plot: {filename}")

View File

@ -1,5 +1,5 @@
import click import click
from data.main import connect from data.main import connect, paths
import os import os
from pathlib import Path from pathlib import Path
import seaborn as sns import seaborn as sns
@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca') @click.command('plot:sentence-pca')
def sentence_pca(): def sentence_pca():
filename = "embedding_sentence_pca.png" save_to = paths('figures') / "embedding_sentence_pca.png"
DB = connect()
data = DB.query(""" with connect() as db:
SELECT data = db.query("""
pca.first SELECT
,pca.second pca.first
,b.bias as label ,pca.second
FROM top.story_embeddings_pca pca ,b.bias as label
JOIN top.stories s FROM top.story_embeddings_pca pca
ON s.id = pca.story_id JOIN top.stories s
JOIN top.publisher_bias pb ON s.id = pca.story_id
ON pb.publisher_id = s.publisher_id JOIN top.publisher_bias pb
JOIN bias_ratings b ON pb.publisher_id = s.publisher_id
ON b.id = pb.bias_id JOIN bias_ratings b
""").df() ON b.id = pb.bias_id
DB.close() """).df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label']) ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component") ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename) plt.savefig(save_to)
@click.command('plot:avg-sentence-pca') @click.command('plot:avg-sentence-pca')
def avg_sentence_pca(): def avg_sentence_pca():
filename = "avg_embedding_sentence_pca.png" save_to = paths('figures') / "avg_embedding_sentence_pca.png"
DB = connect()
data = DB.query(""" with connect() as db:
SELECT data = db.query("""
pca.first SELECT
,pca.second pca.first
,p.tld ,pca.second
,b.bias as label ,p.tld
FROM top.publisher_embeddings_pca pca ,b.bias as label
JOIN top.publishers p FROM top.publisher_embeddings_pca pca
ON p.id = pca.publisher_id JOIN top.publishers p
JOIN top.publisher_bias pb ON p.id = pca.publisher_id
ON pb.publisher_id = p.id JOIN top.publisher_bias pb
JOIN bias_ratings b ON pb.publisher_id = p.id
ON b.id = pb.bias_id JOIN bias_ratings b
""").df() ON b.id = pb.bias_id
DB.close() """).df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label']) ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component") ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename) plt.savefig(save_to)
@click.command('plot:sentence-confusion') @click.command('plot:sentence-confusion')
def sentence_confusion(): def sentence_confusion():
@ -65,32 +60,31 @@ def sentence_confusion():
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay from sklearn.metrics import ConfusionMatrixDisplay
filename = "sentence_confusion.png" save_to = paths('figures') / "sentence_confusion.png"
embeddings = np.load(data_path / 'embeddings.npy') embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy') embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index() ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect() with connect() as db:
data = DB.query(""" data = db.query("""
SELECT SELECT
ids.index ids.index
,s.id ,s.id
,b.ordinal ,b.ordinal
FROM ids FROM ids
JOIN top.stories s JOIN top.stories s
ON ids.story_id = s.id ON ids.story_id = s.id
JOIN top.publisher_bias pb JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
""").df() """).df()
pub = DB.query(""" pub = db.query("""
SELECT SELECT
* *
FROM top.publishers FROM top.publishers
""").df() """).df()
DB.close()
train, test = train_test_split(data) train, test = train_test_split(data)
train_x, train_y = embeddings[train['index']], train['ordinal'] train_x, train_y = embeddings[train['index']], train['ordinal']
@ -105,7 +99,7 @@ def sentence_confusion():
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax) ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right'] ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels) ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_path / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved plot: {filename}") print(f"saved plot: {save_to}")

View File

@ -1,138 +1,135 @@
import click import click
from data.main import connect from data.main import connect, paths, ticklabels
import os
from pathlib import Path
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time') @click.command('plot:sentiment-over-time')
def over_time(): def over_time():
filename = "sentiment_over_time.png"
DB = connect() filename = "sentiment_over_time.png"
data = DB.sql(""" save_to = paths('figures') / filename
SELECT
avg(sent.class_id) as sentiment with connect() as db:
,s.published_at as date data = db.sql("""
FROM top.story_sentiments sent SELECT
JOIN top.stories s avg(sent.class_id) as sentiment
ON s.id = sent.story_id ,s.published_at as date
GROUP BY FROM top.story_sentiments sent
s.published_at JOIN top.stories s
""").df() ON s.id = sent.story_id
DB.close() GROUP BY
s.published_at
""").df()
ax = sns.scatterplot(x=data['date'], y=data['sentiment']) ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
ax.set(title="sentiment vs. time") ax.set(title="sentiment vs. time")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time') @click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time(): def bias_over_time():
"""plot sentiment/bias vs. time"""
filename = "bias_vs_sentiment_over_time.png" filename = "bias_vs_sentiment_over_time.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
data = DB.sql(""" data = db.sql("""
SELECT with cte as (
avg(sent.class_id) as sentiment SELECT
,date_trunc('yearweek', s.published_at) as date avg(sent.class_id) as sentiment
--,b.ordinal as ordinal ,date_trunc('yearweek', s.published_at) as date
,b.bias ,p.bias
FROM top.story_sentiments sent FROM story_sentiments sent
JOIN top.stories s JOIN stories s
ON s.id = sent.story_id ON s.id = sent.story_id
JOIN publisher_bias pb JOIN mbfc.publisher_stories ps
ON pb.publisher_id = s.publisher_id ON ps.story_id = s.id
JOIN bias_ratings b JOIN mbfc.publishers p
ON b.id = pb.bias_id ON p.id = ps.publisher_id
GROUP BY WHERE p.ordinal != -1
date_trunc('yearweek', s.published_at) GROUP BY
,b.bias date_trunc('yearweek', s.published_at)
""").df() ,p.bias
DB.close() )
SELECT
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,date
,bias
FROM cte
WHERE year(date) not in (2005, 2023)
""").df()
order = ['left', 'left-center', 'center', 'right-center', 'right'] #ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order) ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {filename}") print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner') @click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner(): def bias_vs_recent_winner():
"""plot bias vs. distance to election"""
filename = "bias_vs_recent_winner.png" filename = "bias_vs_recent_winner.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
data = DB.sql(""" data = db.sql("""
SELECT SELECT
e.days_away as days_away round(e.days_away, -1) as days_away
,b.ordinal ,p.bias
,avg(sent.class_id) as sentiment ,avg(sent.class_id) as sentiment
,count(1) as stories ,count(1) as stories
FROM top.stories s FROM stories s
JOIN top.story_sentiments sent JOIN story_sentiments sent
ON s.id = sent.story_id ON s.id = sent.story_id
JOIN election_distance e JOIN election_distance e
ON e.publish_date = s.published_at ON e.publish_date = s.published_at
JOIN publisher_bias pb JOIN mbfc.publisher_stories ps
ON pb.publisher_id = s.publisher_id ON ps.story_id = s.id
JOIN bias_ratings b JOIN mbfc.publishers p
ON b.id = pb.bias_id ON p.id = ps.publisher_id
GROUP BY GROUP BY
e.days_away round(e.days_away, -1)
,b.ordinal ,p.bias
""").df() """).df()
DB.close()
data
ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal']) ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment") ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment")
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved: {save_to}")
print(f"saved: {filename}")
@click.command('plot:sentiment-hist') @click.command('plot:sentiment-hist')
def sentiment_hist(): def sentiment_hist():
filename = "sentiment_hist.png" filename = "sentiment_hist.png"
save_to = paths('figures') / filename
DB = connect() with connect() as db:
data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query(""" ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
select ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
sent.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from top.story_sentiments sent
join top.stories s
on s.id = sent.story_id
group by
sent.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(save_to)
print(f"saved: {filename}") plt.close()
print(f"saved: {save_to}")

View File

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np
DB = connect()
edges = DB.query("""
select
*
from link_edges
""").df()
DB.close()
edges
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
DB = connect()
DB.query("create schema top")
DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel from transformers import AutoTokenizer, AutoModel
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from data.main import connect, data_dir from data.main import connect, paths
import os import os
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
ids = np.concatenate(embedding_ids) ids = np.concatenate(embedding_ids)
# save embeddings # save embeddings
save_to = data_dir() / 'embeddings.npy' save_to = paths('data') / 'embeddings.npy'
np.save(save_to, embeddings) np.save(save_to, embeddings)
print(f"embeddings saved: {save_to}") print(f"embeddings saved: {save_to}")
@ -75,29 +75,28 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table') @click.command('sentence:create-avg-pca-table')
def create_avg_pca_table(): def create_avg_pca_table():
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy') embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy') embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index() ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query(""" with connect() as db:
SELECT data = db.query("""
ids.index SELECT
,s.id ids.index
,s.publisher_id ,s.id
,b.ordinal ,s.publisher_id
FROM ids ,b.ordinal
JOIN top.stories s FROM ids
ON ids.story_id = s.id JOIN top.stories s
JOIN top.publisher_bias pb ON ids.story_id = s.id
ON pb.publisher_id = s.publisher_id JOIN top.publisher_bias pb
JOIN bias_ratings b ON pb.publisher_id = s.publisher_id
ON b.id = pb.bias_id JOIN bias_ratings b
""").df() ON b.id = pb.bias_id
DB.close() """).df()
results = [] results = []
for publisher_id, group in data.groupby(['publisher_id']): for publisher_id, group in data.groupby(['publisher_id']):
@ -115,47 +114,45 @@ def create_avg_pca_table():
results['second'] = pred[:, 1] results['second'] = pred[:, 1]
table_name = "top.publisher_embeddings_pca" table_name = "top.publisher_embeddings_pca"
DB = connect() with connect() as db:
DB.query(f""" db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS CREATE OR REPLACE TABLE {table_name} AS
SELECT SELECT
results.publisher_id as publisher_id results.publisher_id as publisher_id
,results.first as first ,results.first as first
,results.second as second ,results.second as second
FROM results FROM results
""") """)
DB.close()
print(f"created {table_name}") print(f"created {table_name}")
@click.command('sentence:create-pca-table') @click.command('sentence:create-pca-table')
def create_pca_table(): def create_pca_table():
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy') embeddings = np.load(path('data') / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy') embedding_ids = np.load(path('data') / 'embedding_ids.npy')
DB = connect() with connect() as db:
data = DB.query(""" data = db.query("""
SELECT SELECT
ids.index ids.index
,s.id ,s.id
,b.ordinal ,b.ordinal
FROM ids FROM ids
JOIN top.stories s JOIN top.stories s
ON ids.story_id = s.id ON ids.story_id = s.id
JOIN top.publisher_bias pb JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
""").df() """).df()
pub = DB.query(""" pub = db.query("""
SELECT SELECT
* *
FROM top.publishers FROM top.publishers
""").df() """).df()
DB.close()
x = embeddings[data['index']] x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1) y = data['ordinal'].to_numpy().reshape(-1, 1)
@ -166,42 +163,41 @@ def create_pca_table():
table_name = f"top.story_embeddings_pca" table_name = f"top.story_embeddings_pca"
DB = connect() with connect() as db:
DB.query(f""" db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS CREATE OR REPLACE TABLE {table_name} AS
SELECT SELECT
data.id as story_id data.id as story_id
,data.first as first ,data.first as first
,data.second as second ,data.second as second
FROM data FROM data
""") """)
DB.close()
print(f"created {table_name}") print(f"created {table_name}")
@click.command('sentence:create-svm-table') @click.command('sentence:create-svm-table')
def create_svm_table(): def create_svm_table():
from sklearn import svm from sklearn import svm
from sklearn.linear_model import SGDClassifier from sklearn.linear_model import SGDClassifier
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy') embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy') embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index() ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect() with connect() as db:
data = DB.query(""" data = db.query("""
SELECT SELECT
ids.index ids.index
,s.id ,s.id
,b.ordinal ,b.ordinal
FROM ids FROM ids
JOIN top.stories s JOIN top.stories s
ON ids.story_id = s.id ON ids.story_id = s.id
JOIN top.publisher_bias pb JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
""").df() """).df()
x = embeddings[data['index']] x = embeddings[data['index']]
#y = data['ordinal'].to_numpy().reshape(-1, 1) #y = data['ordinal'].to_numpy().reshape(-1, 1)