add mbfc data. use context manager for db. add paths fn.

This commit is contained in:
matt 2023-06-01 09:44:01 -07:00
parent 398228f02c
commit 79808f01d6
40 changed files with 1354 additions and 1137 deletions
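
The recurring change below swaps manual DB = connect() / DB.close() pairs for a context-managed connection. A minimal sketch of the pattern, assuming connect() returns a DuckDB connection to the project database (the database filename here is an assumption):

    import os
    from pathlib import Path
    import duckdb

    def connect():
        # assumed: the project database lives under DATA_MINING_DATA_DIR
        return duckdb.connect(str(Path(os.environ['DATA_MINING_DATA_DIR']) / 'data.duckdb'))

    # duckdb connections implement the context-manager protocol,
    # so the connection closes when the block exits, even on error
    with connect() as db:
        db.sql("select 1 as ok").show()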

(17 binary image files changed: updated figure renders. The diff viewer's per-image Before/After width, height, and size readouts are omitted here.)
src/apriori.py (new file, 27 lines)

@@ -0,0 +1,27 @@
import click  # missing in the committed file; needed for the decorator below
from efficient_apriori import apriori
from data.main import connect

@click.command("apriori:rules")
def rules():
    DB = connect()
    data = DB.query("""
        SELECT
            --list_prepend(parent.id, list(child.id)) as transaction
            list_prepend(parent.tld, list(child.tld)) as transaction
        FROM stories s
        JOIN related_stories r
            ON r.parent_id = s.id
        JOIN publishers parent
            ON parent.id = s.publisher_id
        JOIN publishers child
            ON child.id = r.publisher_id
        GROUP BY
            --parent.id
            parent.tld
    """).df()
    DB.close()
    transactions = data.transaction.apply(lambda x: tuple(x)).values
    itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
    print(*rules, sep="\n")
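
For context, efficient_apriori's apriori() takes an iterable of transaction tuples and returns (itemsets, rules). A toy run with invented tld transactions:

    from efficient_apriori import apriori

    # invented transactions standing in for the per-publisher tld lists above
    transactions = [
        ('cnn.com', 'nytimes.com', 'foxnews.com'),
        ('cnn.com', 'nytimes.com'),
        ('foxnews.com', 'breitbart.com'),
    ]
    itemsets, rules = apriori(transactions, min_support=0.3, min_confidence=0.8)
    print(*rules, sep="\n")  # e.g. {nytimes.com} -> {cnn.com}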

View File

@@ -1,41 +1,16 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import pandas as pd
 from lxml import etree
 from pathlib import Path
 import os
 import csv

-def label_to_int(rating:str) -> int:
-    mapping = {
-        'left' : 0,
-        'left-center' : 1,
-        'center' : 2,
-        'right-center' : 3,
-        'right' : 4,
-        'allsides' : -1,
-    }
-    return mapping[rating]
-
-def int_to_label(class_id: int) -> str:
-    mapping = {
-        0 : 'left',
-        1 : 'left-center',
-        2 : 'center',
-        3 : 'right-center',
-        4 : 'right',
-        -1 : 'allsides',
-    }
-    return mapping[class_id]
-
 @click.command(name="bias:normalize")
 def normalize() -> None:
-    DB = connect()
-    DB.sql("""
+    with connect() as db:
+        db.sql("""
         CREATE OR REPLACE TABLE publisher_bias AS
         WITH cte AS (
             SELECT
@@ -72,9 +47,9 @@ def normalize() -> None:
     ]
     mapping = pd.DataFrame(mapping)

-    DB.query("alter table bias_ratings add column ordinal int")
-    DB.query("""
+    with connect() as db:
+        db.query("alter table bias_ratings add column ordinal int")
+        db.query("""
         update bias_ratings b
         set ordinal = o.ordinal
         FROM mapping o
@@ -85,9 +60,7 @@ def normalize() -> None:
 @click.command(name='bias:parse')
 def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    bias_html = DATA_DIR / 'allsides.html'
+    bias_html = paths('data') / 'allsides.html'

     parser = etree.HTMLParser()
     tree = etree.parse(str(bias_html), parser)
@@ -111,15 +84,14 @@ def parse() -> None:
         rating['disagree'] = int(disagree)
         ratings.append(rating)
     df = pd.DataFrame(ratings)
-    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)

 @click.command(name="bias:load")
 def load() -> None:
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    f = str(DATA_DIR / "bias_ratings.csv")
-    DB.sql(f"""
+    f = str(paths('data') / "bias_ratings.csv")
+    with connect() as db:
+        db.sql(f"""
         CREATE TABLE bias_ratings as
         select
             row_number() over(order by b.publisher) as id
@@ -129,10 +101,8 @@ def load() -> None:
 @click.command('bias:export')
 def export():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
-    DB = connect()
-    all_bias = DB.query("""
+    with connect() as db:
+        all_bias = db.query("""
         SELECT
             id as bias_id
             ,publisher as name
@@ -140,8 +110,10 @@ def export():
         FROM bias_ratings
         ORDER by agree desc
     """)
-    all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
-    mapped_bias = DB.query("""
+    all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
+
+    with connect() as db:
+        mapped_bias = db.query("""
         SELECT
             p.id as publisher_id
             ,p.name as name
@@ -152,18 +124,16 @@ def export():
         LEFT JOIN publisher_bias b
             ON b.publisher_id = p.id
     """)
-    mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
-    DB.close()
+    mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)

 @click.command('bias:import-mapped')
 def import_mapped():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
     table_name = "top.publisher_bias"
-    DB = connect()
-    df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
-    DB.query(f"""
+    df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
+    with connect() as db:
+        db.query(f"""
         CREATE OR REPLACE TABLE {table_name} AS
         SELECT
             publisher_id AS publisher_id
@@ -171,5 +141,6 @@ def import_mapped():
         FROM df
         WHERE bias_id IS NOT NULL
     """)
     print(f"created table: {table_name}")

View File

@@ -1,5 +1,7 @@
 import click
 from dotenv import load_dotenv
+import data
+import plots

 @click.group()
 def cli():
@@ -7,12 +9,20 @@ def cli():
 if __name__ == "__main__":
     load_dotenv()

-    from data import scrape
-    cli.add_command(scrape.download)
-    cli.add_command(scrape.parse)
-    cli.add_command(scrape.load)
-    cli.add_command(scrape.normalize)
-    cli.add_command(scrape.create_elections_table)
+    # original bias ratings
+    cli.add_command(data.scrape.download)
+    cli.add_command(data.scrape.parse)
+    cli.add_command(data.scrape.load)
+    cli.add_command(data.scrape.normalize)
+    cli.add_command(data.scrape.create_elections_table)
+
+    cli.add_command(data.factcheck.parse_index)
+    cli.add_command(data.factcheck.scrape)
+
+    cli.add_command(data.links.create_table)
+    cli.add_command(data.links.create_pca)
+    cli.add_command(data.links.create_clusters)

     import word
     # cli.add_command(word.distance)
@@ -23,10 +33,12 @@ if __name__ == "__main__":
     cli.add_command(bias.parse)
     cli.add_command(bias.load)
     cli.add_command(bias.normalize)

     import mine
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)

     import emotion
     cli.add_command(emotion.extract)
     cli.add_command(emotion.normalize)
@@ -40,34 +52,20 @@ if __name__ == "__main__":
     from train import main as train_main
     cli.add_command(train_main.main)

-    import plots.descriptive as plotd
-    cli.add_command(plotd.articles_per_year)
-    cli.add_command(plotd.distinct_publishers)
-    cli.add_command(plotd.stories_per_publisher)
-    cli.add_command(plotd.top_publishers)
-    cli.add_command(plotd.common_tld)
-
-    import links as linkcli
-    cli.add_command(linkcli.create_table)
-    cli.add_command(linkcli.create_pca)
-    cli.add_command(linkcli.create_clusters)
-
-    import plots.links as plotl
-    cli.add_command(plotl.elbow)
-    cli.add_command(plotl.link_pca_clusters)
-
-    import plots.classifier as plotc
-    cli.add_command(plotc.pca_with_classes)
-
-    import plots
+    cli.add_command(plots.descriptive.articles_per_year)
+    cli.add_command(plots.descriptive.distinct_publishers)
+    cli.add_command(plots.descriptive.stories_per_publisher)
+    cli.add_command(plots.descriptive.top_publishers)
+    cli.add_command(plots.descriptive.common_tld)
     cli.add_command(plots.sentence.sentence_pca)
     cli.add_command(plots.sentence.avg_sentence_pca)
     cli.add_command(plots.emotion.emotion_over_time)
     cli.add_command(plots.emotion.emotion_regression)
     cli.add_command(plots.sentiment.over_time)
     cli.add_command(plots.sentiment.bias_over_time)
     cli.add_command(plots.sentiment.bias_vs_recent_winner)
+    cli.add_command(plots.links.elbow)
+    cli.add_command(plots.links.link_pca_clusters)
+    cli.add_command(plots.classifier.pca_with_classes)

     cli()
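
The registrations above are plain click composition: subcommands defined elsewhere are attached to one group and dispatched by name. A self-contained sketch (command name hypothetical):

    import click

    @click.group()
    def cli():
        """data mining commands"""

    @click.command("hello:world")  # hypothetical command for illustration
    def hello():
        click.echo("hello")

    cli.add_command(hello)

    if __name__ == "__main__":
        cli()  # `python main.py hello:world` prints: hello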

View File

@@ -1,6 +1,10 @@
 import data.main
 import data.scrape
+import data.factcheck
+import data.links

 __all__ = [
     'main'
     ,'scrape'
+    ,'factcheck'
+    ,'links'
 ]

src/data/factcheck.py (new file, 171 lines)

@@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, bias_label_to_int  # bias_label_to_int was missing from the committed import
from random import randint
from time import sleep
from tqdm import tqdm

@click.command('mbfc:parse-index')
def parse_index():
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        html = response.content
        tree = etree.parse(BytesIO(html), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            publisher = {}
            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
            link = link.xpath('./a')[0]
            publisher['name'] = link.text
            publisher['detail_url'] = link.get('href')
            publisher['bias'] = bias.text
            publisher['reporting'] = reporting.text
            publisher['country'] = country.text
            publisher['credibility'] = credibility.text
            publisher['media_type'] = media_type.text
            publisher['traffic'] = traffic.text
            publisher['popularity'] = popularity.xpath('./span')[0].text
            publishers.append(publisher)
    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)

@click.command("mbfc:schema")
def schema():
    with connect() as db:
        db.sql("""create schema mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
        )
        """)

@click.command("mbfc:scrape")
def scrape():
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
    with connect() as db:
        stats = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
                on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
    for url in df.url:
        delay = randint(1, 3)
        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url)
        except Exception as e:
            print(f"request failed: {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)

def load():
    publishers = []
    for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        # find the "Source:" text node, climb to its <p>, take the first link
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
        else:
            breakpoint()
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")

@click.command('mbfc:create-tables')
def create_tables():
    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        db.sql("""
        CREATE OR REPLACE TABLE mbfc.publishers AS
        SELECT
            row_number() over() as id
            ,p.tld
            ,mode(p.name) as name
            ,mode(p.bias) as bias
            ,mode(p.ordinal) as ordinal
            ,mode(p.reporting) as reporting
            ,mode(p.country) as country
            ,mode(p.credibility) as credibility
            ,mode(p.media_type) as media_type
            ,mode(p.traffic) as traffic
            ,mode(p.popularity) as popularity
        FROM df p
        GROUP BY
            p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()
    raw_stories['tld'] = raw_stories.url.apply(map_tld)  # the committed code assigned to an undefined `stories`

    with connect() as db:
        db.sql("""
        CREATE OR REPLACE TABLE mbfc.publisher_stories AS
        SELECT
            s.id as story_id
            ,p.id as publisher_id
        FROM raw_stories s
        JOIN mbfc.publishers p
            ON p.tld = s.tld
        """)

src/data/links.py (new file, 135 lines)

@@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd

@click.command('links:create-table')
def create_table():
    with connect() as db:
        db.query(f"""
        CREATE OR REPLACE TABLE link_edges AS
        with cte as(
            SELECT
                s.publisher_id as parent_id
                ,r.publisher_id as child_id
                ,count(1) as links
            FROM stories s
            JOIN related_stories r
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            cte.parent_id
            ,cte.child_id
            ,cte.links as links
            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
            ,case when cte.links > 0 then 1 else 0 end as onehot
        FROM cte
        WHERE cte.child_id in (
            SELECT
                distinct parent_id
            FROM cte
        )
        AND cte.parent_id in (
            SELECT
                distinct child_id
            FROM cte
        )
        """)
        db.query("""
        SELECT
            *
            ,count(1) over()
        FROM link_edges e
        limit 1
        """)
    print("created link_edges")

@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels"""
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"
    with connect() as db:
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON p.id = ps.publisher_id
        """).df()
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    svd = PCA(n_components=2)
    svd_out = svd.fit_transform(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['first'] = svd_out[:, 0]
    out['second'] = svd_out[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        db.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            out.id as publisher_id
            ,out.first as first
            ,out.second as second
        FROM out
        """)
    print(f"created {table_name}")

@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """create link adj. matrix clusters table"""
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"
    with connect() as db:
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            n.id as publisher_id
            ,n.label as label
        FROM new_table n
        """)
    print(f"created {table_name}")

View File

@@ -2,6 +2,10 @@ import os
 from pathlib import Path
 import duckdb
 from enum import Enum
+from urllib.parse import urlparse
+from tld import get_tld
+from tld.utils import update_tld_names
+import sys

 class Data(str, Enum):
     Titles = 'titles'
@@ -9,6 +13,16 @@ class Data(str, Enum):
 def data_dir():
     return Path(os.environ['DATA_MINING_DATA_DIR'])

+def paths(name='app'):
+    if 'app' in name:
+        return Path(os.environ['DATA_MINING_APP_DIR'])
+    if 'data' in name:
+        return Path(os.environ['DATA_MINING_DATA_DIR'])
+    if 'doc' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR'])
+    if 'figure' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
+
 def connect():
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
     # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@@ -28,3 +42,66 @@ def from_db(t: Data):
     limit 100
     """).df()
     return table
+
+def map_tld(x):
+    try:
+        res = get_tld(x, as_object=True)
+        return res.fld
+    except:
+        print(f"'{x}' is not valid.", file=sys.stderr)
+        return None
+
+def ticklabels():
+    return [
+        'Left',
+        'Left-Center',
+        'Least Biased',
+        'Right-Center',
+        'Right',
+    ]
+
+def bias_label_to_int(rating: str, source: str = 'mbfc') -> int:
+    if source == 'mbfc':
+        mapping = {
+            'Left' : 0,
+            'Left-Center' : 1,
+            'Least Biased' : 2,
+            'Right-Center' : 3,
+            'Right' : 4,
+        }
+    else:
+        mapping = {
+            'left' : 0,
+            'left-center' : 1,
+            'center' : 2,
+            'right-center' : 3,
+            'right' : 4,
+        }
+    try:
+        return mapping[rating]
+    except:
+        print(f"no mapping for {rating}", file=sys.stderr)
+        return -1
+
+def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
+    if source == 'mbfc':
+        mapping = {
+            0 : 'Left',
+            1 : 'Left-Center',
+            2 : 'Least Biased',
+            3 : 'Right-Center',
+            4 : 'Right',
+        }
+    else:
+        mapping = {
+            0 : 'left',
+            1 : 'left-center',
+            2 : 'center',
+            3 : 'right-center',
+            4 : 'right',
+        }
+    try:
+        return mapping[class_id]
+    except:
+        print(f"no mapping for {class_id}", file=sys.stderr)
+        return -1
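
A quick illustration of how the helpers added here compose (assuming the DATA_MINING_* environment variables are set; the URL is an example):

    from data.main import paths, map_tld, bias_label_to_int

    paths('figures')       # $DATA_MINING_DOCS_DIR/figures as a Path
    map_tld('https://www.nytimes.com/section/politics')  # 'nytimes.com'
    bias_label_to_int('Least Biased')   # 2, using the mbfc label set
    bias_label_to_int('center', source='allsides')  # 2, any non-'mbfc' source selects the lowercase set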

View File

@@ -319,12 +319,6 @@ def another_norm():
     """)

-def map_tld(x):
-    try:
-        res = get_tld(x, as_object=True)
-        return res.fld
-    except:
-        return None

 DB.sql("""
     SELECT
src/data/selection.py (new file, 47 lines)

@@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np

def create_tables():
    with connect() as db:
        edges = db.query("""
            select
                *
            from link_edges
        """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        db.query("create schema top")
        db.query("""
        CREATE OR REPLACE TABLE top.publishers AS
        SELECT
            p.*
        FROM publishers p
        JOIN select_publishers s
            ON s.publisher_id = p.id
        """)
        db.query("""
        CREATE OR REPLACE TABLE top.stories AS
        SELECT
            s.*
        FROM stories s
        JOIN top.publishers p
            ON s.publisher_id = p.id
        WHERE year(s.published_at) >= 2006
        AND year(s.published_at) < 2023
        """)
        db.query("""
        CREATE OR REPLACE TABLE top.related_stories AS
        SELECT
            r.*
        FROM top.stories s
        JOIN related_stories r
            ON s.id = r.parent_id
        """)

View File

@@ -1,10 +1,11 @@
+import click
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
 import torch.nn.functional as F
-from data import connect, data_dir
+from data.main import connect, paths
 import numpy as np
 from tqdm import tqdm
-import click
+import pandas as pd

 @click.option('-c', '--chunks', type=int, default=500, show_default=True)
 @click.command("sentiment:extract")
@@ -67,20 +68,19 @@ def extract(chunks):
 @click.command('sentiment:load')
 def load():
-    DB = connect()
-    sentiments = np.load(data_dir() / 'sentiment.npy')
-    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
+    sentiments = np.load(paths('data') / 'sentiment.npy')
+    story_ids = np.load(paths('data') / 'sentiment_ids.npy')

     data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
     data['sentiment_id'] = sentiments

-    DB.query("""
-    CREATE OR REPLACE TABLE top.story_sentiments AS
+    with connect() as db:
+        db.query("""
+        CREATE OR REPLACE TABLE story_sentiments AS
         SELECT
             data.story_id
             ,data.sentiment_id as class_id
             ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
         FROM data
-        JOIN top.stories s
+        JOIN stories s
             ON s.id = data.story_id
     """)
-    DB.close()

View File

@@ -1,255 +0,0 @@ (entire file removed)
import click
from data.main import connect
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

@click.command('links:create-table')
def create_table():
    table_name = "top.link_edges"
    DB = connect()
    DB.query(f"""
    CREATE OR REPLACE TABLE {table_name} AS
    with cte as(
        SELECT
            s.publisher_id as parent_id
            ,r.publisher_id as child_id
            ,count(1) as links
        FROM top.stories s
        JOIN top.related_stories r
            ON s.id = r.parent_id
        group by
            s.publisher_id
            ,r.publisher_id
    )
    SELECT
        cte.parent_id
        ,cte.child_id
        ,cte.links as links
        ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
        ,case when cte.links > 0 then 1 else 0 end as onehot
    FROM cte
    WHERE cte.child_id in (
        SELECT
            distinct parent_id
        FROM cte
    )
    AND cte.parent_id in (
        SELECT
            distinct child_id
        FROM cte
    )
    """)
    DB.close()

    DB = connect()
    DB.query("""
    SELECT
        *
        ,-log10(links)
        --distinct parent_id
    FROM top.link_edges e
    WHERE e.parent_id = 238
    """)
    DB.close()
    print(f"created {table_name}")

@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels"""
    from sklearn.decomposition import PCA

    table_name = f"top.publisher_pca_{source}"
    DB = connect()
    pub = DB.query("""
        SELECT
            *
        FROM top.publishers
    """).df()
    df = DB.query(f"""
        SELECT
            parent_id
            ,child_id
            ,{source} as links
        FROM top.link_edges
    """).df()
    DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    svd = PCA(n_components=2)
    svd_out = svd.fit_transform(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['first'] = svd_out[:, 0]
    out['second'] = svd_out[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    DB = connect()
    DB.query(f"""
    CREATE OR REPLACE TABLE {table_name} AS
    SELECT
        out.id as publisher_id
        ,out.first as first
        ,out.second as second
    FROM out
    """)
    DB.close()
    print(f"created {table_name}")

@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    from sklearn.cluster import KMeans

    table_name = f"top.publisher_clusters_{source}"
    DB = connect()
    df = DB.query(f"""
        SELECT
            parent_id
            ,child_id
            ,{source} as links
        FROM top.link_edges
    """).df()
    pub = DB.query("""
        SELECT
            *
        FROM top.publishers
    """).df()
    DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]

    DB = connect()
    DB.query(f"""
    CREATE OR REPLACE TABLE {table_name} AS
    SELECT
        n.id as publisher_id
        ,n.label as label
    FROM new_table n
    """)
    DB.close()
    print(f"created {table_name}")

def to_matrix():
    """returns an adjacency matrix of publishers to publisher link frequency"""
    DB = connect()
    bias_map = pd.DataFrame([
        {'label' : 'left', 'value' : 0},
        {'label' : 'left-center', 'value' : 1},
        {'label' : 'center', 'value' : 2},
        {'label' : 'right-center', 'value' : 3},
        {'label' : 'right', 'value' : 4},
        {'label' : 'allsides', 'value' : -1},
    ])
    bias = DB.sql("""
        SELECT
            b.id
            ,b.label
            ,m.value
        FROM publisher_bias b
        JOIN bias_map m
            ON b.label = m.label
        WHERE value != -1
    """).df()
    pub = DB.sql("""
        select
            p.id
            ,p.name
            ,p.url
        from publishers p
    """).df()
    edges = DB.sql("""
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
                ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
            ON p.id = cte.parent_id
    """).df()

    # only keep values that have more than 1 link
    test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
    edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    adj.values.shape
    out = pd.DataFrame(adj.index.values, columns=['id'])
    out = pd.merge(out, pub, how='left', on='id')
    return out

@click.command('links:analysis')
def analysis():
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.cluster import MiniBatchKMeans

    adj = to_matrix()
    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)
    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)

    x = svd_out[:, 0]
    y = svd_out[:, 1]
    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()

    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)
    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()

    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
    plt.show()

View File

@@ -1,6 +1,5 @@
-from data.main import data_dir, connect
+from data.main import connect, paths
 import numpy as np
-import sklearn
 from sklearn.cluster import MiniBatchKMeans
 import click
 from pathlib import Path
@@ -11,7 +10,7 @@ from enum import Enum, auto
 @click.command(name="mine:embeddings")
 def embeddings():
-    data = np.load(data_dir() / "embeddings.npy")
+    data = np.load(paths('data') / "embeddings.npy")
     kmeans = MiniBatchKMeans(n_clusters=5,
                              random_state=0,
                              batch_size=6,
@@ -76,7 +75,7 @@ class PlotName(str, Enum):
 @click.option('-n', '--name', required=True, type=click.Choice(PlotName))
 @click.option('-o', '--output', required=False, type=click.Path())
 def plot(name: PlotName, output: Path):
-    output = output if output else APP_DIR / f'docs/{name}.png'
+    output = output if output else paths('figures') / f'{name}.png'
     if name == PlotName.TitleLength:
         fig, ax = plt.subplots(1,1)
         data = db.sql("""

src/mining/bias.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path

def normalize():
    with connect() as db:
        db.sql("""
            SELECT
                p.name
                ,count(1) as ctn
                ,sum(ctn) over() as all
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
            JOIN stories s
                ON s.id = ps.story_id
            GROUP BY
                p.name
        """)

    with connect() as db:
        db.sql("""
            SELECT
                bias
                ,count(distinct p.id) as publishers
                ,count(1) as stories
                ,count(1) / count(distinct p.id) as ratio
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
            JOIN stories s
                ON s.id = ps.story_id
            GROUP BY
                p.bias
            ORDER BY count(1)
        """)

View File

@@ -1,9 +1,13 @@
 import plots.sentence
 import plots.emotion
 import plots.sentiment
+import plots.links
+import plots.classifier

 __all__ = [
     'sentence',
     'emotion',
     'sentiment',
+    'links',
+    'classifier',
 ]

View File

@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths, bias_label_to_int, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
@@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd

-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:bias-hist')
 def hist():
-    filename = "bias_hist.png"
-    DB = connect()
-    data = DB.sql("""
+    save_to = paths('figures') / "bias_hist.png"
+    with connect() as db:
+        data = db.sql("""
         SELECT
-            b.ordinal
+            p.ordinal
            ,count(1) as stories
         FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        WHERE ordinal != -1
+        JOIN mbfc.publisher_stories ps
+            ON s.id = ps.story_id
+        JOIN mbfc.publishers p
+            ON ps.publisher_id = p.id
         GROUP BY
-            b.ordinal
+            p.ordinal
     """).df()
-    DB.close()

     ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:bias-publisher-hist')
 def publisher_hist():
-    filename = "bias_publisher_hist.png"
-    DB = connect()
-    data = DB.sql("""
+    save_to = paths('figures') / "bias_publisher_hist.png"
+    with connect() as db:
+        data = db.sql("""
         SELECT
-            b.ordinal
-            ,count(1) as publishers
-        FROM publisher_bias pb
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        WHERE ordinal != -1
+            p.ordinal
+            ,count(distinct p.id) as publishers
+        FROM mbfc.publishers p
+        JOIN mbfc.publisher_stories ps
+            ON ps.publisher_id = p.id
         GROUP BY
-            b.ordinal
+            p.ordinal
     """).df()
-    DB.close()

     ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")

View File

@@ -5,30 +5,32 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path

-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:pca-with-classes')
-def pca_with_classes():
-    filename = "pca_with_classes.png"
-    DB = connect()
-    data = DB.query(f"""
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def pca_with_classes(source):
+    save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
+
+    with connect() as db:
+        df = db.query(f"""
         SELECT
             p.tld
-            ,b.bias
+            ,p.bias
             ,c.first
             ,c.second
-            ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
-        FROM top.publishers p
-        JOIN top.publisher_bias pb
-            ON p.id = pb.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        JOIN top.publisher_pca_normalized c
+            --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+        FROM mbfc.publishers p
+        JOIN publisher_pca_{source} c
             ON c.publisher_id = p.id
+        WHERE p.ordinal != -1
+        ORDER BY p.ordinal
     """).df()
-    DB.close()

-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
-    ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
-    print(f"saved: {filename}")
+    ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
+    ax.set(xlabel="first pca component",
+           ylabel="second pca component")
+    ax.figure.suptitle("pca components vs. bias labels")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')

View File

@@ -1,18 +1,16 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 import numpy as np

-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:articles-per-year')
 def articles_per_year():
-    filename = 'articles_per_year.png'
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / 'articles_per_year.png'
+    with connect() as db:
+        data = db.query("""
         select
             year(published_at) as year
@@ -21,19 +19,19 @@ def articles_per_year():
         group by
             year(published_at)
     """).df()
-    DB.close()

     ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of articles per year", ylabel="count of stories (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    print(f"saved: {save_to}")

 @click.command('plot:distinct-publishers')
 def distinct_publishers():
-    filename = 'distinct_publishers.png'
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / 'distinct_publishers.png'
+    with connect() as db:
+        data = db.query("""
         select
             year(published_at) as year
@@ -42,30 +40,32 @@ def distinct_publishers():
         group by
             year(published_at)
     """).df()
-    DB.close()

     ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:stories-per-publisher')
 def stories_per_publisher():
-    filename = 'stories_per_publisher.png'
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / 'stories_per_publisher.png'
+    with connect() as db:
+        data = db.query("""
         with cte as (
             select
-                publisher_id
-                ,year(published_at) as year
+                ps.publisher_id
+                ,year(s.published_at) as year
                 ,count(1) as stories
-            from stories
+            from stories s
+            join mbfc.publisher_stories ps
+                on ps.story_id = s.id
             group by
-                publisher_id
-                ,year(published_at)
+                ps.publisher_id
+                ,year(s.published_at)
         ) , agg as (
             select
                 publisher_id
@@ -91,64 +91,86 @@ def stories_per_publisher():
         group by
             max_avg
     """).df()
-    DB.close()

     ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
-    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
+    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:top-publishers')
 def top_publishers():
     """plot top publishers over time"""
-    filename = 'top_publishers.png'
-    DB = connect()
-    data = DB.query("""
-        select
-            p.tld
-            ,year(published_at) as year
-            ,count(1) as stories
-        from (
-            select
-                p.tld
-                ,p.id
-            from top.publishers p
-            join top.stories s
-                on s.publisher_id = p.id
-            group by
-                p.tld
-                ,p.id
-            order by count(1) desc
-            limit 20
-        ) p
-        join top.stories s
-            on s.publisher_id = p.id
-        group by
-            p.tld
-            ,year(published_at)
-        order by count(distinct s.id) desc
-    """).df()
-    DB.close()
+    save_to = paths('figures') / 'top_publishers.png'
+    with connect() as db:
+        db.query("""
+            SELECT
+                p.tld
+                ,p.id
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.tld
+                ,p.id
+            order by count(1) desc
+            limit 20
+        """)
+
+    with connect() as db:
+        data = db.query("""
+            WITH p as (
+                SELECT
+                    p.tld
+                    ,p.id
+                FROM mbfc.publishers p
+                JOIN mbfc.publisher_stories ps
+                    ON ps.publisher_id = p.id
+                JOIN stories s
+                    ON s.id = ps.story_id
+                GROUP BY
+                    p.tld
+                    ,p.id
+                order by count(1) desc
+                limit 20
+            )
+            SELECT
+                p.tld
+                ,YEAR(s.published_at) AS year
+                ,COUNT(1) AS stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN p
+                ON p.id = ps.publisher_id
+            GROUP by
                p.tld
+                ,YEAR(published_at)
+            ORDER BY year, COUNT(DISTINCT s.id) DESC
+        """).df()

     pivot = data.pivot(columns='year', index='tld', values='stories')
     ax = sns.heatmap(pivot, cmap="crest")
     ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:common_tld')
 def common_tld():
     import dataframe_image as dfi
-    filename = 'common_tld.png'
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / 'common_tld.png'
+    with connect() as db:
+        data = db.query("""
         select
             split_part(url, '.', -1) as tld
             ,count(1) as publishers
@@ -162,8 +184,7 @@ def common_tld():
         order by
             count(1) desc
     """).df()
-    DB.close()
-    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
+    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')

 def stats():
@@ -246,7 +267,7 @@ def stats():
 @click.command('plot:bias-stats')
 def bias_stats():
     import dataframe_image as dfi
-    filename = 'bias_stats.png'
+    save_to = paths('figures') / 'bias_stats.png'

     DB = connect()
@@ -300,3 +321,69 @@ def bias_stats():
     """).df()
     DB.close()
     print(df.to_markdown(index=False))
+
+@click.command('plot:bias-over-time')
+def bias_over_time():
+    """plot bias labels over time"""
+    save_to = paths('figures') / 'bias_over_time.png'
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                p.bias
+                ,p.id
+                ,date_trunc('year', s.published_at) as year
+                ,count(1) as stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN mbfc.publishers p
+                ON p.id = ps.publisher_id
+            where year(s.published_at) not in (2006, 2023)
+            and p.ordinal != -1
+            GROUP BY
+                p.bias
+                ,p.id
+                ,p.ordinal
+                ,date_trunc('year', s.published_at)
+            order by
+                p.ordinal
+                ,date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+
+def bias_missing():
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                date_trunc('year', s.published_at) as year
+                ,s.tld
+                ,count(1) as stories
+            FROM stories s
+            LEFT JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            WHERE ps.publisher_id is NULL
+            AND year(s.published_at) not in (2006, 2023)
+            GROUP BY
+                s.tld
+                ,date_trunc('year', s.published_at)
+            HAVING count(1) > 10
+            ORDER BY
+                date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.show()
+    #plt.savefig(save_to)
+    plt.close()
+    #print(f"saved: {save_to}")

View File

@@ -1,25 +1,24 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import os
-from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd

-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:emotion-over-time')
 def emotion_over_time():
-    filename = "emotion_over_time.png"
-    DB = connect()
-    emotions = DB.sql("""
+    filename = "emotion_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        emotions = db.sql("""
         SELECT
             date_trunc('year', s.published_at) AS year
             ,e.label AS emotion
             ,count(1) AS stories
-        FROM top.stories s
+        FROM stories s
         JOIN story_emotions se
             ON s.id = se.story_id
         JOIN emotions e
@@ -28,50 +27,53 @@ def emotion_over_time():
             date_trunc('year', s.published_at)
             ,e.label
     """).df()
-    DB.close()

     ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
     ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')

 @click.command('plot:emotion-regression')
 def emotion_regression():
+    """plot emotion over time as regression"""
     from sklearn import linear_model
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import ConfusionMatrixDisplay

     filename = "emotion_regression.png"
+    save_to = paths('figures') / filename

-    DB = connect()
-    emotions = DB.query("""
-        SELECT
-            label
-        FROM emotions e
-    """).df()['label'].to_list()
-    DB.close()
-    DB = connect()
-    df = DB.sql(f"""
+    with connect() as db:
+        #emotions = db.query("""
+        #    SELECT
+        #        label
+        #    FROM emotions e
+        #""").df()['label'].to_list()
+        df = db.sql(f"""
         SELECT
             epoch(date_trunc('yearweek', s.published_at)) AS date
             ,e.id AS emotion_id
             ,p.id as publisher_id
             ,count(1) AS stories
-        FROM top.stories s
-        JOIN top.publishers p
-            ON p.id = s.publisher_id
+        FROM stories s
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
         JOIN story_emotions se
            ON s.id = se.story_id
         JOIN emotions e
            ON e.id = se.emotion_id
+        WHERE p.ordinal != -1
        GROUP by
            epoch(date_trunc('yearweek', s.published_at))
            ,p.id
            ,e.id
     """).df()
-    DB.close()

     results = []
     for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@@ -83,77 +85,59 @@ def emotion_regression():
         results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
     results = pd.DataFrame(results)

-    DB = connect()
-    out = DB.query("""
+    with connect() as db:
+        out = db.query("""
         SELECT
             e.label as emotion
-            --,p.tld
             ,avg(results.per_year) as avg_reg_coef
-            ,b.ordinal
+            ,p.bias
         FROM results
         JOIN emotions e
             ON e.id = results.emotion_id
-        JOIN top.publishers p
+        JOIN mbfc.publishers p
            ON p.id = results.publisher_id
-        JOIN publisher_bias pb
-            ON pb.publisher_id = results.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
         GROUP BY
             e.label
-            ,b.ordinal
+            ,p.bias
     """).df()
-    DB.close()

-    pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
-    ax = sns.heatmap(pivot, cmap='RdBu_r')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
+    ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
+    #ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
     ax.set(title="slope of regression (stories/year) by bias and emotion"
-           ,xticklabels=ticklabels
+           ,xticklabels=ticklabels()
           ,xlabel="bias"
          ,ylabel="emotion")
    plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:emotion-hist')
 def emotion_hist():
     filename = "emotion_hist.png"
+    save_to = paths('figures') / filename

-    DB = connect()
-    DB.query("""describe story_emotions""")
-    DB.query("""
-        select
-            e.label
-            ,count(distinct s.id) as stories
-            ,count(distinct s.publisher_id) as publishers
-        from story_emotions se
-        join emotions e
-            on e.id = se.emotion_id
-        join top.stories s
-            on s.id = se.story_id
-        group by
-            e.label
-    """).df().to_markdown(index=False)
-    data = DB.sql("""
+    with connect() as db:
+        data = db.sql("""
         SELECT
-            b.ordinal
+            p.bias
             ,count(1) as stories
         FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+        WHERE p.ordinal != -1
         GROUP BY
-            b.ordinal
+            p.bias
     """).df()
-    DB.close()

-    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")

View File

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score from sklearn.metrics import silhouette_score
import pandas as pd import pandas as pd
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow') @click.command('plot:link-elbow')
def elbow(): def elbow():
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
filename = 'link_cluster_elbow.png' save_to = paths('figures') / 'link_cluster_elbow.png'
DB = connect() with connect() as db:
df = DB.query(""" df = db.query("""
SELECT SELECT
* *
FROM link_edges FROM link_edges
""").df() """).df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0) pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
to_plot = [] to_plot = []
@ -36,8 +36,9 @@ def elbow():
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia) ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points") ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(out_dir / filename) plt.savefig(save_to)
plt.close() plt.close()
print(f"saved plot: {save_to}")
# randomly pick 8 # randomly pick 8
@ -45,10 +46,10 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links') @click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source): def link_pca_clusters(source):
filename = f"link_pca_clusters_{source}.png" save_to = paths('figures') / f"link_pca_clusters_{source}.png"
DB = connect() with connect() as db:
df = DB.query(f""" df = db.query(f"""
SELECT SELECT
c.label as cluster c.label as cluster
,p.tld ,p.tld
@ -72,21 +73,17 @@ def link_pca_clusters(source):
JOIN top.publisher_pca_{source} pca JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id ON pca.publisher_id = p.id
""").df() """).df()
DB.close()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster']) ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component") ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename) plt.savefig(save_to)
print(f"saved plot: {save_to}")
# .df().groupby(['cluster', 'bias']).describe()
 def test():
-    data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
-    DB.query("""
+    with connect() as db:
+        db.query("""
        SELECT
            p.id as publisher_id
            ,p.name
@@ -109,9 +106,6 @@ def test():
        ORDER BY count(1) desc
    """)
-    # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
-    DB.close()
 
 @click.command('plot:link-confusion')
 def link_confusion():
@@ -120,9 +114,10 @@ def link_confusion():
     from sklearn.metrics import ConfusionMatrixDisplay
 
     filename = "link_confusion.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-    bias = DB.query("""
+    with connect() as db:
+        bias = db.query("""
        SELECT
            p.id as publisher_id
            ,b.ordinal
@@ -133,7 +128,7 @@ def link_confusion():
            ON b.id = pb.bias_id
    """).df()
-    df = DB.query("""
+        df = db.query("""
        SELECT
            *
        FROM top.link_edges
@@ -148,6 +143,7 @@ def link_confusion():
            from bias
        )
    """).df()
+
     pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
     x = pivot.values
@@ -166,9 +162,9 @@ def link_confusion():
     ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
     ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
     ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
 
 @click.command('plot:link-classifier')
 def link_confusion():
@@ -176,10 +172,10 @@ def link_confusion():
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.metrics import ConfusionMatrixDisplay
 
-    filename = "link_confusion.png"
+    save_to = paths('figures') / "link_confusion.png"
 
-    DB = connect()
-    bias = DB.query("""
+    with connect() as db:
+        bias = db.query("""
        SELECT
            p.id as publisher_id
            ,b.ordinal
@@ -190,7 +186,7 @@ def link_confusion():
            ON b.id = pb.bias_id
    """).df()
-    df = DB.query("""
+        df = db.query("""
        SELECT
            *
        FROM top.link_edges
@@ -205,12 +201,14 @@ def link_confusion():
            from bias
        )
    """).df()
 
     pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
     x = pivot.values
     y = bias.sort_values('publisher_id').ordinal
-    data = DB.query(f"""
+
+    with connect() as db:
+        data = db.query(f"""
        SELECT
            p.id as publisher_id
            ,pca.first
@@ -235,11 +233,11 @@ def link_confusion():
     ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
     ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
     ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved plot: {filename}")
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
-    plt.savefig(out_dir / filename)
-    plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
+    # ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
+    # plt.savefig(out_dir / filename)
+    # plt.close()
+    # print(f"saved plot: {filename}")
View File
@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 from pathlib import Path
 import seaborn as sns
@@ -7,15 +7,12 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
-
 @click.command('plot:sentence-pca')
 def sentence_pca():
-    filename = "embedding_sentence_pca.png"
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / "embedding_sentence_pca.png"
+
+    with connect() as db:
+        data = db.query("""
        SELECT
            pca.first
            ,pca.second
@@ -28,18 +25,17 @@ def sentence_pca():
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
-    DB.close()
 
     ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
     ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
 
 @click.command('plot:avg-sentence-pca')
 def avg_sentence_pca():
-    filename = "avg_embedding_sentence_pca.png"
-    DB = connect()
-    data = DB.query("""
+    save_to = paths('figures') / "avg_embedding_sentence_pca.png"
+
+    with connect() as db:
+        data = db.query("""
        SELECT
            pca.first
            ,pca.second
@@ -53,11 +49,10 @@ def avg_sentence_pca():
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
-    DB.close()
 
     ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
     ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
 
 @click.command('plot:sentence-confusion')
 def sentence_confusion():
@@ -65,14 +60,14 @@ def sentence_confusion():
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.metrics import ConfusionMatrixDisplay
 
-    filename = "sentence_confusion.png"
+    save_to = paths('figures') / "sentence_confusion.png"
 
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
     ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
        SELECT
            ids.index
            ,s.id
@@ -85,12 +80,11 @@ def sentence_confusion():
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
-    pub = DB.query("""
+        pub = db.query("""
        SELECT
            *
        FROM top.publishers
    """).df()
-    DB.close()
 
     train, test = train_test_split(data)
     train_x, train_y = embeddings[train['index']], train['ordinal']
@@ -105,7 +99,7 @@ def sentence_confusion():
     ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
     ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
     ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved plot: {filename}")
+    print(f"saved plot: {save_to}")
View File
@@ -1,20 +1,16 @@
 import click
-from data.main import connect
-import os
-from pathlib import Path
+from data.main import connect, paths, ticklabels
 import seaborn as sns
 import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
 
 @click.command('plot:sentiment-over-time')
 def over_time():
-    filename = "sentiment_over_time.png"
-    DB = connect()
-    data = DB.sql("""
+    filename = "sentiment_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        data = db.sql("""
        SELECT
            avg(sent.class_id) as sentiment
            ,s.published_at as date
@@ -24,115 +20,116 @@ def over_time():
        GROUP BY
            s.published_at
    """).df()
-    DB.close()
 
     ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
     ax.set(title="sentiment vs. time")
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 @click.command('plot:bias-vs-sentiment-over-time')
 def bias_over_time():
-    filename = "bias_vs_sentiment_over_time.png"
-    DB = connect()
-    data = DB.sql("""
+    """plot sentiment/bias vs. time"""
+    filename = "bias_vs_sentiment_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        data = db.sql("""
+        with cte as (
        SELECT
            avg(sent.class_id) as sentiment
            ,date_trunc('yearweek', s.published_at) as date
-            --,b.ordinal as ordinal
-            ,b.bias
-        FROM top.story_sentiments sent
-        JOIN top.stories s
+            ,p.bias
+        FROM story_sentiments sent
+        JOIN stories s
            ON s.id = sent.story_id
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+        WHERE p.ordinal != -1
        GROUP BY
            date_trunc('yearweek', s.published_at)
-            ,b.bias
+            ,p.bias
+        )
+        SELECT
+            median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
+            ,date
+            ,bias
+        FROM cte
+        WHERE year(date) not in (2005, 2023)
    """).df()
-    DB.close()
 
-    order = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
+    #ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
+    ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
+    plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
+    ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
 
 @click.command('plot:sentiment-recent-winner')
 def bias_vs_recent_winner():
-    filename = "bias_vs_recent_winner.png"
-    DB = connect()
-    data = DB.sql("""
+    """plot bias vs. distance to election"""
+    filename = "bias_vs_recent_winner.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        data = db.sql("""
        SELECT
-            e.days_away as days_away
-            ,b.ordinal
+            round(e.days_away, -1) as days_away
+            ,p.bias
            ,avg(sent.class_id) as sentiment
            ,count(1) as stories
-        FROM top.stories s
-        JOIN top.story_sentiments sent
+        FROM stories s
+        JOIN story_sentiments sent
            ON s.id = sent.story_id
        JOIN election_distance e
            ON e.publish_date = s.published_at
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
        GROUP BY
-            e.days_away
-            ,b.ordinal
+            round(e.days_away, -1)
+            ,p.bias
    """).df()
-    DB.close()
-    data
 
-    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
+    ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
     ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
 
 @click.command('plot:sentiment-hist')
 def sentiment_hist():
     filename = "sentiment_hist.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-
-    DB.query("""
-        select
-            sent.label
-            ,count(distinct s.id) as stories
-            ,count(distinct s.publisher_id) as publishers
-        from top.story_sentiments sent
-        join top.stories s
-            on s.id = sent.story_id
-        group by
-            sent.label
-    """).df().to_markdown(index=False)
-
-    data = DB.sql("""
+    with connect() as db:
+        data = db.sql("""
        SELECT
-            b.ordinal
+            p.bias
            ,count(1) as stories
        FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+        WHERE p.ordinal != -1
        GROUP BY
-            b.ordinal
+            p.bias
    """).df()
-    DB.close()
 
-    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
View File
@@ -1,48 +0,0 @@
-from data.main import connect
-import pandas as pd
-import numpy as np
-
-DB = connect()
-edges = DB.query("""
-    select
-        *
-    from link_edges
-""").df()
-DB.close()
-edges
-
-adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
-
-DB = connect()
-DB.query("create schema top")
-
-DB.query("""
-    CREATE OR REPLACE TABLE top.publishers AS
-    SELECT
-        p.*
-    FROM publishers p
-    JOIN select_publishers s
-        ON s.publisher_id = p.id
-""")
-
-DB.query("""
-    CREATE OR REPLACE TABLE top.stories AS
-    SELECT
-        s.*
-    FROM stories s
-    JOIN top.publishers p
-        ON s.publisher_id = p.id
-    WHERE year(s.published_at) >= 2006
-    AND year(s.published_at) < 2023
-""")
-
-DB.query("""
-    CREATE OR REPLACE TABLE top.related_stories AS
-    SELECT
-        r.*
-    FROM top.stories s
-    JOIN related_stories r
-        ON s.id = r.parent_id
-""")
View File
@@ -1,7 +1,7 @@
 from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn.functional as F
-from data.main import connect, data_dir
+from data.main import connect, paths
 import os
 from pathlib import Path
 import numpy as np
@@ -62,7 +62,7 @@ def embed(chunks):
     ids = np.concatenate(embedding_ids)
 
     # save embeddings
-    save_to = data_dir() / 'embeddings.npy'
+    save_to = paths('data') / 'embeddings.npy'
     np.save(save_to, embeddings)
     print(f"embeddings saved: {save_to}")
@@ -75,15 +75,15 @@ def embed(chunks):
 @click.command('sentence:create-avg-pca-table')
 def create_avg_pca_table():
     from sklearn.decomposition import PCA
-    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
 
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
     ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
        SELECT
            ids.index
            ,s.id
@@ -97,7 +97,6 @@ def create_avg_pca_table():
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
-    DB.close()
 
     results = []
     for publisher_id, group in data.groupby(['publisher_id']):
@@ -115,8 +114,8 @@ def create_avg_pca_table():
     results['second'] = pred[:, 1]
 
     table_name = "top.publisher_embeddings_pca"
-    DB = connect()
-    DB.query(f"""
+    with connect() as db:
+        db.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            results.publisher_id as publisher_id
@@ -124,20 +123,19 @@ def create_avg_pca_table():
            ,results.second as second
        FROM results
    """)
-    DB.close()
     print(f"created {table_name}")
 
 @click.command('sentence:create-pca-table')
 def create_pca_table():
     from sklearn.decomposition import PCA
-    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
 
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
        SELECT
            ids.index
            ,s.id
@@ -150,12 +148,11 @@ def create_pca_table():
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
-    pub = DB.query("""
+        pub = db.query("""
        SELECT
            *
        FROM top.publishers
    """).df()
-    DB.close()
 
     x = embeddings[data['index']]
     y = data['ordinal'].to_numpy().reshape(-1, 1)
@@ -166,8 +163,8 @@ def create_pca_table():
 
     table_name = f"top.story_embeddings_pca"
-    DB = connect()
-    DB.query(f"""
+    with connect() as db:
+        db.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            data.id as story_id
@@ -175,21 +172,20 @@ def create_pca_table():
            ,data.second as second
        FROM data
    """)
-    DB.close()
     print(f"created {table_name}")
 
 @click.command('sentence:create-svm-table')
 def create_svm_table():
     from sklearn import svm
     from sklearn.linear_model import SGDClassifier
-    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
 
-    embeddings = np.load(data_path / 'embeddings.npy')
-    embedding_ids = np.load(data_path / 'embedding_ids.npy')
+    embeddings = np.load(paths('data') / 'embeddings.npy')
+    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
     ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
        SELECT
            ids.index
            ,s.id