Merge branch 'feature_factcheck'

This commit is contained in:
matt 2023-06-01 09:44:28 -07:00
commit 81f4f37c9d
40 changed files with 1354 additions and 1137 deletions

Binary image files not shown. 17 images changed: 7 added (148, 22, 40, 87, 66, 40, 29 KiB) and 10 updated (21->22, 16->19, 235->73, 202->104, 44->46, 30->36, 51->66, 128->128, 22->21, 54->53 KiB).

27
src/apriori.py Normal file
View File

@ -0,0 +1,27 @@
import click
from efficient_apriori import apriori
from data.main import connect
@click.command("apriori:rules")
def rules():
DB = connect()
data = DB.query("""
SELECT
--list_prepend(parent.id, list(child.id)) as transaction
list_prepend(parent.tld, list(child.tld)) as transaction
FROM stories s
JOIN related_stories r
ON r.parent_id = s.id
JOIN publishers parent
ON parent.id = s.publisher_id
JOIN publishers child
ON child.id = r.publisher_id
GROUP BY
--parent.id
parent.tld
""").df()
DB.close()
transactions = data.transaction.apply(lambda x: tuple(x)).values
itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
print(*rules, sep="\n")
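A minimal sketch of what rules() computes above, calling efficient_apriori directly on hand-written transactions (the TLDs are placeholders, not values from the dataset):

from efficient_apriori import apriori

# each transaction: a parent publisher's tld followed by the tlds it links to
transactions = [
    ('a.example', 'b.example', 'c.example'),
    ('a.example', 'b.example'),
    ('d.example', 'b.example'),
]
itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
print(*rules, sep="\n")  # e.g. {a.example} -> {b.example}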

View File

@ -1,67 +1,42 @@
import click
from data.main import connect
from data.main import connect, paths
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
def label_to_int(rating:str) -> int:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
def int_to_label(class_id: int) -> str:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
-1 : 'allsides',
}
return mapping[class_id]
@click.command(name="bias:normalize")
def normalize() -> None:
DB = connect()
DB.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id as publisher_id
,b.id as bias_id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN top.publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id as publisher_id
,b.id as bias_id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN top.publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
publisher_id
,bias_id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
publisher_id
,bias_id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
publisher_id
,label
,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
mapping = [
{'label' :'left' , 'ordinal': -2},
@ -72,22 +47,20 @@ def normalize() -> None:
]
mapping = pd.DataFrame(mapping)
DB.query("alter table bias_ratings add column ordinal int")
DB.query("""
update bias_ratings b
set ordinal = o.ordinal
FROM mapping o
WHERE o.label = b.bias
""")
with connect() as db:
db.query("alter table bias_ratings add column ordinal int")
db.query("""
update bias_ratings b
set ordinal = o.ordinal
FROM mapping o
WHERE o.label = b.bias
""")
@click.command(name='bias:parse')
def parse() -> None:
"""parse the save html page of allslides.com bias ratings into a normalized csv file"""
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
bias_html = DATA_DIR / 'allsides.html'
bias_html = paths('data') / 'allsides.html'
parser = etree.HTMLParser()
tree = etree.parse(str(bias_html), parser)
@ -111,65 +84,63 @@ def parse() -> None:
rating['disagree'] = int(disagree)
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load")
def load() -> None:
DB = connect()
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
f = str(DATA_DIR / "bias_ratings.csv")
f = str(paths('data') / "bias_ratings.csv")
DB.sql(f"""
CREATE TABLE bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
with connect() as db:
db.sql(f"""
CREATE TABLE bias_ratings as
select
row_number() over(order by b.publisher) as id
,b.*
from read_csv_auto('{f}') b
""")
@click.command('bias:export')
def export():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
with connect() as db:
all_bias = db.query("""
SELECT
id as bias_id
,publisher as name
,bias as label
FROM bias_ratings
ORDER by agree desc
""")
DB = connect()
all_bias = DB.query("""
SELECT
id as bias_id
,publisher as name
,bias as label
FROM bias_ratings
ORDER by agree desc
all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
with connect() as db:
mapped_bias = db.query("""
SELECT
p.id as publisher_id
,p.name as name
,p.tld as tld
,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
mapped_bias = DB.query("""
SELECT
p.id as publisher_id
,p.name as name
,p.tld as tld
,b.label as bias
,b.bias_id as bias_id
FROM top.publishers p
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
""")
mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
DB.close()
mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
@click.command('bias:import-mapped')
def import_mapped():
data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
table_name = "top.publisher_bias"
DB = connect()
df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
publisher_id AS publisher_id
,cast(bias_id AS int) as bias_id
FROM df
WHERE bias_id IS NOT NULL
""")
print(f"created table: {table_name}")

View File

@ -1,5 +1,7 @@
import click
from dotenv import load_dotenv
import data
import plots
@click.group()
def cli():
@ -7,12 +9,20 @@ def cli():
if __name__ == "__main__":
load_dotenv()
from data import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
cli.add_command(scrape.normalize)
cli.add_command(scrape.create_elections_table)
# original bias ratings
cli.add_command(data.scrape.download)
cli.add_command(data.scrape.parse)
cli.add_command(data.scrape.load)
cli.add_command(data.scrape.normalize)
cli.add_command(data.scrape.create_elections_table)
cli.add_command(data.factcheck.parse_index)
cli.add_command(data.factcheck.scrape)
cli.add_command(data.links.create_table)
cli.add_command(data.links.create_pca)
cli.add_command(data.links.create_clusters)
import word
# cli.add_command(word.distance)
@ -23,10 +33,12 @@ if __name__ == "__main__":
cli.add_command(bias.parse)
cli.add_command(bias.load)
cli.add_command(bias.normalize)
import mine
cli.add_command(mine.embeddings)
cli.add_command(mine.cluster)
cli.add_command(mine.plot)
import emotion
cli.add_command(emotion.extract)
cli.add_command(emotion.normalize)
@ -40,34 +52,20 @@ if __name__ == "__main__":
from train import main as train_main
cli.add_command(train_main.main)
import plots.descriptive as plotd
cli.add_command(plotd.articles_per_year)
cli.add_command(plotd.distinct_publishers)
cli.add_command(plotd.stories_per_publisher)
cli.add_command(plotd.top_publishers)
cli.add_command(plotd.common_tld)
import links as linkcli
cli.add_command(linkcli.create_table)
cli.add_command(linkcli.create_pca)
cli.add_command(linkcli.create_clusters)
import plots.links as plotl
cli.add_command(plotl.elbow)
cli.add_command(plotl.link_pca_clusters)
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.descriptive.articles_per_year)
cli.add_command(plots.descriptive.distinct_publishers)
cli.add_command(plots.descriptive.stories_per_publisher)
cli.add_command(plots.descriptive.top_publishers)
cli.add_command(plots.descriptive.common_tld)
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli.add_command(plots.links.elbow)
cli.add_command(plots.links.link_pca_clusters)
cli.add_command(plots.classifier.pca_with_classes)
cli()
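The block above is plain click wiring: one group, many add_command calls. A self-contained sketch of the same pattern (demo:hello is hypothetical):

import click

@click.group()
def cli():
    pass

@click.command('demo:hello')
def hello():
    print("hello")

cli.add_command(hello)

if __name__ == "__main__":
    cli()  # run as e.g.: python app.py demo:hello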

View File

@ -1,6 +1,10 @@
import data.main
import data.scrape
import data.factcheck
import data.links
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
]

171
src/data/factcheck.py Normal file
View File

@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from random import randint
from time import sleep
from tqdm import tqdm
@click.command('mbfc:parse-index')
def parse_index():
parser = etree.HTMLParser()
publishers = []
for page in range(1, 54):
url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
print(f"downloading {url}", file=sys.stderr)
response = requests.get(url)
html = response.content
tree = etree.parse(BytesIO(html), parser)
rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
print(f"parsing {len(rows)} rows", file=sys.stderr)
for row in rows:
publisher = {}
link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
link = link.xpath('./a')[0]
publisher['name'] = link.text
publisher['detail_url'] = link.get('href')
publisher['bias'] = bias.text
publisher['reporting'] = reporting.text
publisher['country'] = country.text
publisher['credibility'] = credibility.text
publisher['media_type'] = media_type.text
publisher['traffic'] = traffic.text
publisher['popularity'] = popularity.xpath('./span')[0].text
publishers.append(publisher)
df = pd.DataFrame(publishers)
save_to = paths('data') / 'mbfc_bias.csv'
df.to_csv(save_to, sep='|', index=False)
print(f"saved {len(df)}: {save_to}", file=sys.stderr)
@click.command("mbfc:schema")
def schema():
with connect() as db:
db.sql("""create schema mbfc""")
db.sql("""create or replace table mbfc.scrape (
url text
,scraped_at datetime default now()
)
""")
@click.command("mbfc:scrape")
def scrape():
df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
with connect() as db:
stats = db.query("""
select
count(1) filter(where s.url is not null) as elapsed
,count(1) filter(where s.url is null) as remaining
from df
left join mbfc.scrape s
on df.detail_url = s.url
""").fetchall()
df = db.query("""
select
detail_url as url
from df
where df.detail_url not in (
select
url
from mbfc.scrape
)
""").df()
print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
for url in df.url:
delay = randint(1,3)
save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
sleep(delay)
try:
response = requests.get(url)
except Exception as e:
print(f"request failed: {url}", file=sys.stderr)
continue
with open(save_as, 'w') as f:
f.write(response.text)
with connect() as db:
db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
print(f"saved: {save_as}", file=sys.stderr)
def load():
publishers = []
for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
publisher = {}
publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
with page.open() as p:
tree = BeautifulSoup(p, 'html.parser')
for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
e = e.parent
while e.name != 'p':
e = e.parent
l = e.find('a')
if l:
publisher['tld'] = l.get('href')
break
else:
breakpoint()
publishers.append(publisher)
df = pd.DataFrame(publishers)
df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
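load() above locates the text node matching "Source:", climbs to its enclosing <p>, and takes the first link as the publisher's homepage. The same traversal on an inline snippet:

import re
from bs4 import BeautifulSoup

html = '<p><strong>Source:</strong> <a href="https://example.com/">example.com</a></p>'
tree = BeautifulSoup(html, 'html.parser')
for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
    p = e.find_parent('p')  # bs4 shortcut for the while-loop climb above
    print(p.find('a').get('href'))  # https://example.com/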
@click.command('mbfc:create-tables')
def create_tables():
pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
df = pubs.merge(urls, on='mbfc_url')
df['tld'] = df.tld.apply(map_tld)
df['ordinal'] = df.bias.apply(bias_label_to_int)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publishers AS
SELECT
row_number() over() as id
,p.tld
,mode(p.name) as name
,mode(p.bias) as bias
,mode(p.ordinal) as ordinal
,mode(p.reporting) as reporting
,mode(p.country) as country
,mode(p.credibility) as credibility
,mode(p.media_type) as media_type
,mode(p.traffic) as traffic
,mode(p.popularity) as popularity
FROM df p
GROUP BY
p.tld
""")
with connect() as db:
raw_stories = db.sql("""
SELECT
*
FROM stories s
""").df()
raw_stories['tld'] = raw_stories.url.apply(map_tld)
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publisher_stories AS
SELECT
s.id as story_id
,p.id as publisher_id
FROM raw_stories s
JOIN mbfc.publishers p
ON p.tld = s.tld
""")

135
src/data/links.py Normal file
View File

@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE link_edges AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM stories s
JOIN related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
db.query("""
SELECT
*
,count(1) over()
FROM link_edges e
limit 1
""")
print(f"created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
with connect() as db:
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")

View File

@ -2,6 +2,10 @@ import os
from pathlib import Path
import duckdb
from enum import Enum
from urllib.parse import urlparse
from tld import get_tld
from tld.utils import update_tld_names
import sys
class Data(str, Enum):
Titles = 'titles'
@ -9,6 +13,16 @@ class Data(str, Enum):
def data_dir():
return Path(os.environ['DATA_MINING_DATA_DIR'])
def paths(name='app'):
if 'app' in name:
return Path(os.environ['DATA_MINING_APP_DIR'])
if 'data' in name:
return Path(os.environ['DATA_MINING_DATA_DIR'])
if 'doc' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR'])
if 'figure' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@ -28,3 +42,66 @@ def from_db(t: Data):
limit 100
""").df()
return table
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
print(f"'{x}' is not valid.", file=sys.stderr)
return None
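map_tld() above is a thin wrapper over tld.get_tld that normalizes any story URL to its registered domain, or None when parsing fails:

from tld import get_tld

res = get_tld('https://www.example.co.uk/some/article', as_object=True)
print(res.fld)  # example.co.uk, the value map_tld returns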
def ticklabels():
return [
'Left',
'Left-Center',
'Least Biased',
'Right-Center',
'Right',
]
def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
if source == 'mbfc':
mapping = {
'Left' : 0,
'Left-Center' : 1,
'Least Biased' : 2,
'Right-Center' : 3,
'Right' : 4,
}
else:
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
}
try:
return mapping[rating]
except:
print(f"no mapping for {rating}", file=sys.stderr)
return -1
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
if source == 'mbfc':
mapping = {
0 : 'Left',
1 : 'Left-Center',
2 : 'Least Biased',
3 : 'Right-Center',
4 : 'Right',
}
else:
mapping = {
0 : 'left',
1 : 'left-center',
2 : 'center',
3 : 'right-center',
4 : 'right',
}
try:
return mapping[class_id]
except:
print(f"no mapping for {class_id}", file=sys.stderr)
return -1
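A quick round-trip check of the two mappings above, assuming the module is importable as data.main (as it is elsewhere in this commit):

from data.main import bias_label_to_int, bias_int_to_label

assert bias_int_to_label(bias_label_to_int('Least Biased')) == 'Least Biased'
assert bias_label_to_int('no such label') == -1  # unmapped ratings warn and return -1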

View File

@ -319,12 +319,6 @@ def another_norm():
""")
def map_tld(x):
try:
res = get_tld(x, as_object=True)
return res.fld
except:
return None
DB.sql("""
SELECT

47
src/data/selection.py Normal file
View File

@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np
def create_tables():
with connect() as db:
edges = db.query("""
select
*
from link_edges
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
with connect() as db:
db.query("create schema top")
db.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
db.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
db.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,10 +1,11 @@
import click
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data import connect, data_dir
from data.main import connect, paths
import numpy as np
from tqdm import tqdm
import click
import pandas as pd
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
@ -67,20 +68,19 @@ def extract(chunks):
@click.command('sentiment:load')
def load():
DB = connect()
sentiments = np.load(data_dir() / 'sentiment.npy')
story_ids = np.load(data_dir() / 'sentiment_ids.npy')
sentiments = np.load(paths('data') / 'sentiment.npy')
story_ids = np.load(paths('data') / 'sentiment_ids.npy')
data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
data['sentiment_id'] = sentiments
DB.query("""
CREATE OR REPLACE TABLE top.story_sentiments AS
SELECT
data.story_id
,data.sentiment_id as class_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
FROM data
JOIN top.stories s
ON s.id = data.story_id
""")
DB.close()
with connect() as db:
db.query("""
CREATE OR REPLACE TABLE story_sentiments AS
SELECT
data.story_id
,data.sentiment_id as class_id
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
FROM data
JOIN stories s
ON s.id = data.story_id
""")

View File

@ -1,255 +0,0 @@
import click
from data.main import connect
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
@click.command('links:create-table')
def create_table():
table_name = "top.link_edges"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM top.stories s
JOIN top.related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
DB.close()
DB = connect()
DB.query("""
SELECT
*
,-log10(links)
--distinct parent_id
FROM top.link_edges e
WHERE e.parent_id = 238
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"top.publisher_pca_{source}"
DB = connect()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
DB.close()
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
from sklearn.cluster import KMeans
table_name = f"top.publisher_clusters_{source}"
DB = connect()
df = DB.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM top.link_edges
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
DB.close()
print(f"created {table_name}")
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
DB = connect()
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
from publishers p
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
# only keep values that have more than 1 link
test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
adj.values.shape
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
return out
@click.command('links:analysis')
def analysis():
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
adj = to_matrix()
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
plt.show()

View File

@ -1,6 +1,5 @@
from data.main import data_dir, connect
from data.main import connect, paths
import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans
import click
from pathlib import Path
@ -11,7 +10,7 @@ from enum import Enum, auto
@click.command(name="mine:embeddings")
def embeddings():
data = np.load(data_dir() / "embeddings.npy")
data = np.load(paths('data') / "embeddings.npy")
kmeans = MiniBatchKMeans(n_clusters=5,
random_state=0,
batch_size=6,
@ -76,7 +75,7 @@ class PlotName(str, Enum):
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: Path):
output = output if output else APP_DIR / f'docs/{name}.png'
output = output if output else paths('figures') / f'{name}.png'
if name == PlotName.TitleLength:
fig, ax = plt.subplots(1,1)
data = db.sql("""

36
src/mining/bias.py Normal file
View File

@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path
def normalize():
with connect() as db:
db.sql("""
SELECT
p.name
,count(1) as ctn
,sum(ctn) over() as total
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.name
""")
with connect() as db:
db.sql("""
SELECT
bias
,count(distinct p.id) as publishers
,count(1) as stories
,count(1) / count(distinct p.id) as ratio
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.bias
ORDER BY count(1)
""")

View File

@ -1,9 +1,13 @@
import plots.sentence
import plots.emotion
import plots.sentiment
import plots.links
import plots.classifier
__all__ = [
'sentence',
'emotion',
'sentiment',
'links',
'classifier',
]

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths, bias_label_to_int, ticklabels
import os
from pathlib import Path
import seaborn as sns
@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
def hist():
filename = "bias_hist.png"
save_to = paths('figures') / "bias_hist.png"
with connect() as db:
data = db.sql("""
SELECT
p.ordinal
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
p.ordinal
""").df()
DB = connect()
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-publisher-hist')
def publisher_hist():
filename = "bias_publisher_hist.png"
save_to = paths('figures') / "bias_publisher_hist.png"
DB = connect()
data = DB.sql("""
SELECT
b.ordinal
,count(1) as publishers
FROM publisher_bias pb
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
with connect() as db:
data = db.sql("""
SELECT
p.ordinal
,count(distinct p.id) as publishers
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
WHERE ordinal != -1
GROUP BY
p.ordinal
""").df()
ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")

View File

@ -5,30 +5,32 @@ import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:pca-with-classes')
def pca_with_classes():
filename = "pca_with_classes.png"
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def pca_with_classes(source):
DB = connect()
data = DB.query(f"""
SELECT
p.tld
,b.bias
,c.first
,c.second
,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM top.publishers p
JOIN top.publisher_bias pb
ON p.id = pb.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN top.publisher_pca_normalized c
ON c.publisher_id = p.id
""").df()
DB.close()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
print(f"saved: {filename}")
save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
with connect() as db:
df = db.query(f"""
SELECT
p.tld
,p.bias
,c.first
,c.second
--,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
FROM mbfc.publishers p
JOIN publisher_pca_{source} c
ON c.publisher_id = p.id
WHERE p.ordinal != -1
ORDER BY p.ordinal
""").df()
ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
ax.set(xlabel="first pca component",
ylabel="second pca component")
ax.figure.suptitle("pca components vs. bias labels")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')

View File

@ -1,169 +1,190 @@
import click
from data.main import connect
from data.main import connect, paths
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:articles-per-year')
def articles_per_year():
filename = 'articles_per_year.png'
save_to = paths('figures') / 'articles_per_year.png'
DB = connect()
data = DB.query("""
select
year(published_at) as year
,count(1) as stories
from stories
group by
year(published_at)
""").df()
DB.close()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
,count(1) as stories
from stories
group by
year(published_at)
""").df()
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of articles per year", ylabel="count of stories (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('plot:distinct-publishers')
def distinct_publishers():
filename = 'distinct_publishers.png'
save_to = paths('figures') / 'distinct_publishers.png'
DB = connect()
data = DB.query("""
select
year(published_at) as year
,count(distinct publisher_id) as publishers
from stories
group by
year(published_at)
""").df()
DB.close()
with connect() as db:
data = DB.query("""
select
year(published_at) as year
,count(distinct publisher_id) as publishers
from stories
group by
year(published_at)
""").df()
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher')
def stories_per_publisher():
filename = 'stories_per_publisher.png'
save_to = paths('figures') / 'stories_per_publisher.png'
DB = connect()
data = DB.query("""
with cte as (
select
publisher_id
,year(published_at) as year
,count(1) as stories
from stories
group by
publisher_id
,year(published_at)
) , agg as (
with connect() as db:
data = db.query("""
with cte as (
select
publisher_id
,avg(stories) as stories_per_year
,case
when avg(stories) < 2 then 2
when avg(stories) < 4 then 4
when avg(stories) < 8 then 8
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
ps.publisher_id
,year(s.published_at) as year
,count(1) as stories
from stories s
join mbfc.publisher_stories ps
on ps.story_id = s.id
group by
publisher_id
)
select
max_avg
,count(1) as publishers
from agg
group by
max_avg
""").df()
DB.close()
ps.publisher_id
,year(s.published_at)
) , agg as (
select
publisher_id
,avg(stories) as stories_per_year
,case
when avg(stories) < 2 then 2
when avg(stories) < 4 then 4
when avg(stories) < 8 then 8
when avg(stories) < 16 then 16
when avg(stories) < 32 then 32
when avg(stories) < 64 then 64
when avg(stories) < 128 then 128
else 129
end as max_avg
from cte
group by
publisher_id
)
select
max_avg
,count(1) as publishers
from agg
group by
max_avg
""").df()
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:top-publishers')
def top_publishers():
"""plot top publishers over time"""
filename = 'top_publishers.png'
save_to = paths('figures') / 'top_publishers.png'
DB = connect()
data = DB.query("""
select
p.tld
,year(published_at) as year
,count(1) as stories
from (
select
with connect() as db:
db.query("""
SELECT
p.tld
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
""")
with connect() as db:
data = db.query("""
WITH p as (
SELECT
p.tld
,p.id
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
JOIN stories s
ON s.id = ps.story_id
GROUP BY
p.tld
,p.id
order by count(1) desc
limit 20
)
SELECT
p.tld
,p.id
from top.publishers p
join top.stories s
on s.publisher_id = p.id
group by
,YEAR(s.published_at) AS year
,COUNT(1) AS stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN p
ON p.id = ps.publisher_id
GROUP by
p.tld
,p.id
order by count(1) desc
limit 20
) p
join top.stories s
on s.publisher_id = p.id
group by
p.tld
,year(published_at)
order by count(distinct s.id) desc
""").df()
DB.close()
,YEAR(published_at)
ORDER BY year, COUNT(DISTINCT s.id) DESC
""").df()
pivot = data.pivot(columns='year', index='tld', values='stories')
ax = sns.heatmap(pivot, cmap="crest")
ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
plt.tight_layout()
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:common_tld')
def common_tld():
import dataframe_image as dfi
filename = 'common_tld.png'
save_to = paths('figures') / 'common_tld.png'
DB = connect()
data = DB.query("""
select
split_part(url, '.', -1) as tld
,count(1) as publishers
,case when count(1) < 20
then string_agg(distinct url, '\t')
else NULL
end as urls
from publishers
group by
split_part(url, '.', -1)
order by
count(1) desc
""").df()
DB.close()
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
with connect() as db:
data = db.query("""
select
split_part(url, '.', -1) as tld
,count(1) as publishers
,case when count(1) < 20
then string_agg(distinct url, '\t')
else NULL
end as urls
from publishers
group by
split_part(url, '.', -1)
order by
count(1) desc
""").df()
data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
def stats():
@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats')
def bias_stats():
import dataframe_image as dfi
filename = 'bias_stats.png'
save_to = paths('figures') / 'bias_stats.png'
DB = connect()
@ -300,3 +321,69 @@ def bias_stats():
""").df()
DB.close()
print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
def bias_over_time():
"""plot bias labels over time"""
save_to = paths('figures') / 'bias_over_time.png'
with connect() as db:
df = db.sql("""
SELECT
p.bias
,p.id
,date_trunc('year', s.published_at) as year
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
where year(s.published_at) not in (2006, 2023)
and p.ordinal != -1
GROUP BY
p.bias
,p.id
,p.ordinal
,date_trunc('year', s.published_at)
order by
p.ordinal
,date_trunc('year', s.published_at)
""").df()
ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
def bias_missing():
with connect() as db:
df = db.sql("""
SELECT
date_trunc('year', s.published_at) as year
,s.tld
,count(1) as stories
FROM stories s
LEFT JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
WHERE ps.publisher_id is NULL
AND year(s.published_at) not in (2006, 2023)
GROUP BY
s.tld
,date_trunc('year', s.published_at)
HAVING count(1) > 10
ORDER BY
date_trunc('year', s.published_at)
""").df()
ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
ax.set(ylabel="stories", xlabel="year")
plt.tight_layout()
plt.show()
#plt.savefig(save_to)
plt.close()
#print(f"saved: {save_to}")

View File

@ -1,77 +1,79 @@
import click
from data.main import connect
from data.main import connect, paths, ticklabels
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
def emotion_over_time():
filename = "emotion_over_time.png"
DB = connect()
emotions = DB.sql("""
SELECT
date_trunc('year', s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM top.stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
date_trunc('year', s.published_at)
,e.label
""").df()
DB.close()
filename = "emotion_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
emotions = db.sql("""
SELECT
date_trunc('year', s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
date_trunc('year', s.published_at)
,e.label
""").df()
ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression')
def emotion_regression():
"""plot emotion over time as regression"""
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
filename = "emotion_regression.png"
save_to = paths('figures') / filename
DB = connect()
emotions = DB.query("""
SELECT
label
FROM emotions e
""").df()['label'].to_list()
DB.close()
DB = connect()
df = DB.sql(f"""
SELECT
epoch(date_trunc('yearweek', s.published_at)) AS date
,e.id AS emotion_id
,p.id as publisher_id
,count(1) AS stories
FROM top.stories s
JOIN top.publishers p
ON p.id = s.publisher_id
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
epoch(date_trunc('yearweek', s.published_at))
,p.id
,e.id
""").df()
DB.close()
with connect() as db:
#emotions = db.query("""
# SELECT
# label
# FROM emotions e
#""").df()['label'].to_list()
df = db.sql(f"""
SELECT
epoch(date_trunc('yearweek', s.published_at)) AS date
,e.id AS emotion_id
,p.id as publisher_id
,count(1) AS stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE p.ordinal != -1
GROUP by
epoch(date_trunc('yearweek', s.published_at))
,p.id
,e.id
""").df()
results = []
for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@ -83,77 +85,59 @@ def emotion_regression():
results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
results = pd.DataFrame(results)
DB = connect()
out = DB.query("""
SELECT
e.label as emotion
--,p.tld
,avg(results.per_year) as avg_reg_coef
,b.ordinal
FROM results
JOIN emotions e
ON e.id = results.emotion_id
JOIN top.publishers p
ON p.id = results.publisher_id
JOIN publisher_bias pb
ON pb.publisher_id = results.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
e.label
,b.ordinal
""").df()
DB.close()
pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
with connect() as db:
out = db.query("""
SELECT
e.label as emotion
,avg(results.per_year) as avg_reg_coef
,p.bias
FROM results
JOIN emotions e
ON e.id = results.emotion_id
JOIN mbfc.publishers p
ON p.id = results.publisher_id
GROUP BY
e.label
,p.bias
""").df()
ax = sns.heatmap(pivot, cmap='RdBu_r')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
#ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
ax.set(title="slope of regression (stories/year) by bias and emotion"
,xticklabels=ticklabels
,xticklabels=ticklabels()
,xlabel="bias"
,ylabel="emotion")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:emotion-hist')
def emotion_hist():
filename = "emotion_hist.png"
save_to = paths('figures') / filename
DB = connect()
DB.query("""describe story_emotions""")
with connect() as db:
data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query("""
select
e.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from story_emotions se
join emotions e
on e.id = se.emotion_id
join top.stories s
on s.id = se.story_id
group by
e.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:link-elbow')
def elbow():
from sklearn.cluster import KMeans
filename = 'link_cluster_elbow.png'
save_to = paths('figures') / 'link_cluster_elbow.png'
with connect() as db:
df = db.query("""
SELECT
*
FROM link_edges
""").df()
DB = connect()
df = DB.query("""
SELECT
*
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
to_plot = []
@ -36,8 +36,9 @@ def elbow():
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {save_to}")
# randomly pick 8
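The elbow figure above comes from refitting KMeans over a range of k and recording inertia (sum of squared distances to the nearest centroid). A sketch of that loop on random data:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(0).random((40, 5))
to_plot = [(k, KMeans(n_clusters=k, n_init="auto").fit(X).inertia_)
           for k in range(2, 13)]  # plot k vs. inertia and look for the bend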
@ -45,72 +46,65 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
filename = f"link_pca_clusters_{source}.png"
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
DB = connect()
df = DB.query(f"""
SELECT
c.label as cluster
,p.tld
--,b.label as bias
,pca.first
,pca.second
,s.cnt as stories
FROM top.publisher_clusters_{source} c
JOIN top.publishers p
ON c.publisher_id = p.id
JOIN
(
select
s.publisher_id
,count(1) as cnt
FROM top.stories s
GROUP BY
s.publisher_id
) s
ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
DB.close()
with connect() as db:
df = db.query(f"""
SELECT
c.label as cluster
,p.tld
--,b.label as bias
,pca.first
,pca.second
,s.cnt as stories
FROM top.publisher_clusters_{source} c
JOIN top.publishers p
ON c.publisher_id = p.id
JOIN
(
select
s.publisher_id
,count(1) as cnt
FROM top.stories s
GROUP BY
s.publisher_id
) s
ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(out_dir / filename)
# .df().groupby(['cluster', 'bias']).describe()
plt.savefig(save_to)
print(f"saved plot: {save_to}")
def test():
data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
DB.query("""
SELECT
p.id as publisher_id
,p.name
,p.tld
,cast(b.bias_id as int) as bias_id
,count(1) as stories
FROM publishers p
JOIN stories s
ON s.publisher_id = p.id
JOIN publisher_clusters c
ON c.publisher_id = p.id
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
where bias_id is null
group by
p.id
,p.name
,p.tld
,b.bias_id
ORDER BY count(1) desc
""")
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
DB.close()
with connect() as db:
db.query("""
SELECT
p.id as publisher_id
,p.name
,p.tld
,cast(b.bias_id as int) as bias_id
,count(1) as stories
FROM publishers p
JOIN stories s
ON s.publisher_id = p.id
JOIN publisher_clusters c
ON c.publisher_id = p.id
LEFT JOIN publisher_bias b
ON b.publisher_id = p.id
where bias_id is null
group by
p.id
,p.name
,p.tld
,b.bias_id
ORDER BY count(1) desc
""")
@click.command('plot:link-confusion')
@ -120,34 +114,36 @@ def link_confusion():
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / filename
DB = connect()
bias = DB.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
@ -166,9 +162,9 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
@click.command('plot:link-classifier')
def link_confusion():
@ -176,49 +172,51 @@ def link_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / "link_confusion.png"
DB = connect()
bias = DB.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
df = DB.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
y = bias.sort_values('publisher_id').ordinal
data = DB.query(f"""
SELECT
p.id as publisher_id
,pca.first
,pca.second
FROM top.publisher_pca_onehot pca
JOIN top.publishers p
ON pca.publisher_id = p.id
""").df()
with connect() as db:
data = db.query(f"""
SELECT
p.id as publisher_id
,pca.first
,pca.second
FROM top.publisher_pca_onehot pca
JOIN top.publishers p
ON pca.publisher_id = p.id
""").df()
@ -235,11 +233,11 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
plt.savefig(out_dir / filename)
plt.close()
print(f"saved plot: {filename}")
# ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
# plt.savefig(out_dir / filename)
# plt.close()
# print(f"saved plot: {filename}")

View File

@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths
import os
from pathlib import Path
import seaborn as sns
@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
def sentence_pca():
filename = "embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "embedding_sentence_pca.png"
data = DB.query("""
SELECT
pca.first
,pca.second
,b.bias as label
FROM top.story_embeddings_pca pca
JOIN top.stories s
ON s.id = pca.story_id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
,b.bias as label
FROM top.story_embeddings_pca pca
JOIN top.stories s
ON s.id = pca.story_id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
filename = "avg_embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "avg_embedding_sentence_pca.png"
data = DB.query("""
SELECT
pca.first
,pca.second
,p.tld
,b.bias as label
FROM top.publisher_embeddings_pca pca
JOIN top.publishers p
ON p.id = pca.publisher_id
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second
,p.tld
,b.bias as label
FROM top.publisher_embeddings_pca pca
JOIN top.publishers p
ON p.id = pca.publisher_id
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)
@click.command('plot:sentence-confusion')
def sentence_confusion():
@ -65,32 +60,31 @@ def sentence_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "sentence_confusion.png"
save_to = paths('figures') / "sentence_confusion.png"
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
train, test = train_test_split(data)
train_x, train_y = embeddings[train['index']], train['ordinal']
@ -105,7 +99,7 @@ def sentence_confusion():
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

View File

@ -1,138 +1,135 @@
import click
from data.main import connect
import os
from pathlib import Path
from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
def over_time():
filename = "sentiment_over_time.png"
DB = connect()
data = DB.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
GROUP BY
s.published_at
""").df()
DB.close()
filename = "sentiment_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
GROUP BY
s.published_at
""").df()
ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
ax.set(title="sentiment vs. time")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
"""plot sentiment/bias vs. time"""
filename = "bias_vs_sentiment_over_time.png"
save_to = paths('figures') / filename
DB = connect()
data = DB.sql("""
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
--,b.ordinal as ordinal
,b.bias
FROM top.story_sentiments sent
JOIN top.stories s
ON s.id = sent.story_id
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
date_trunc('yearweek', s.published_at)
,b.bias
""").df()
DB.close()
with connect() as db:
data = db.sql("""
with cte as (
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
,p.bias
FROM story_sentiments sent
JOIN stories s
ON s.id = sent.story_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
date_trunc('yearweek', s.published_at)
,p.bias
)
SELECT
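-- DESC ordering makes the 7 FOLLOWING rows the 7 preceding weeks: an 8-week trailing median per bias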
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,date
,bias
FROM cte
WHERE year(date) not in (2005, 2023)
""").df()
order = ['left', 'left-center', 'center', 'right-center', 'right']
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
#ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8-week rolling median sentiment', xlabel='date')
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
"""plot bias vs. distance to election"""
filename = "bias_vs_recent_winner.png"
save_to = paths('figures') / filename
DB = connect()
data = DB.sql("""
SELECT
e.days_away as days_away
,b.ordinal
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM top.stories s
JOIN top.story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
e.days_away
,b.ordinal
""").df()
DB.close()
data
with connect() as db:
data = db.sql("""
SELECT
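-- round(days, -1) bins stories into 10-day buckets around the nearest election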
round(e.days_away, -1) as days_away
,p.bias
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM stories s
JOIN story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
GROUP BY
round(e.days_away, -1)
,p.bias
""").df()
ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title seniment")
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")
@click.command('plot:sentiment-hist')
def sentiment_hist():
filename = "sentiment_hist.png"
save_to = paths('figures') / filename
DB = connect()
with connect() as db:
data = db.sql("""
SELECT
p.bias
,count(1) as stories
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
p.bias
""").df()
DB.query("""
select
sent.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from top.story_sentiments sent
join top.stories s
on s.id = sent.story_id
group by
sent.label
""").df().to_markdown(index=False)
data = DB.sql("""
SELECT
b.ordinal
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
GROUP BY
b.ordinal
""").df()
DB.close()
ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

View File

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np
DB = connect()
edges = DB.query("""
select
*
from link_edges
""").df()
DB.close()
edges
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
DB = connect()
DB.query("create schema top")
DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")
DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")
DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

View File

@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, data_dir
from data.main import connect, paths
import os
from pathlib import Path
import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
ids = np.concatenate(embedding_ids)
# save embeddings
save_to = data_dir() / 'embeddings.npy'
save_to = paths('data') / 'embeddings.npy'
np.save(save_to, embeddings)
print(f"embeddings saved: {save_to}")
@ -75,29 +75,28 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
from sklearn.decomposition import PCA
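# project per-publisher averaged embeddings onto two principal components for plotting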
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,s.publisher_id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,s.publisher_id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
results = []
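# collapse each publisher's story embeddings into a single averaged vector (per the command name)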
for publisher_id, group in data.groupby(['publisher_id']):
@ -115,47 +114,45 @@ def create_avg_pca_table():
results['second'] = pred[:, 1]
table_name = "top.publisher_embeddings_pca"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id
,results.first as first
,results.second as second
FROM results
""")
DB.close()
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id
,results.first as first
,results.second as second
FROM results
""")
print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
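# PCA reduces each story embedding to two components for the 2-D scatter plots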
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)
@ -166,42 +163,41 @@ def create_pca_table():
table_name = f"top.story_embeddings_pca"
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
DB.close()
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
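# SGDClassifier's default hinge loss yields a linear SVM fit by stochastic gradient descent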
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
DB = connect()
data = DB.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
FROM ids
JOIN top.stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
x = embeddings[data['index']]
#y = data['ordinal'].to_numpy().reshape(-1, 1)