add mbfc data. use context manager for db. add paths fn.
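
Most of the changes below replace paired `DB = connect()` / `DB.close()` calls with `with connect() as db:` blocks, so the connection is released even when a query raises. The diff never shows the new body of `connect()` in `data/main.py`, so the following is only a sketch of one way it could support both call styles seen in the code; the database filename is a placeholder.

    # Hypothetical sketch only -- the real connect() body is not part of this diff.
    import os
    from pathlib import Path

    import duckdb

    def connect():
        # duckdb's Python connection implements __enter__/__exit__ and closes
        # itself on exit, so returning it directly supports both the old
        # `DB = connect()` style and the new `with connect() as db:` style.
        data_dir = Path(os.environ['DATA_MINING_DATA_DIR'])
        return duckdb.connect(str(data_dir / 'database.duckdb'))  # filename assumed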
@@ -0,0 +1,27 @@
+import click
+from efficient_apriori import apriori
+from data.main import connect
+
+@click.command("apriori:rules")
+def rules():
+    DB = connect()
+    data = DB.query("""
+        SELECT
+            --list_prepend(parent.id, list(child.id)) as transaction
+            list_prepend(parent.tld, list(child.tld)) as transaction
+        FROM stories s
+        JOIN related_stories r
+            ON r.parent_id = s.id
+        JOIN publishers parent
+            ON parent.id = s.publisher_id
+        JOIN publishers child
+            ON child.id = r.publisher_id
+        GROUP BY
+            --parent.id
+            parent.tld
+    """).df()
+    DB.close()
+
+    transactions = data.transaction.apply(lambda x: tuple(x)).values
+
+    itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
+    print(*rules, sep="\n")
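For context on the `apriori:rules` command above: `efficient_apriori.apriori` takes an iterable of transaction tuples and returns frequent itemsets plus association rules. A toy run with made-up TLD transactions (not data from the project database):

    from efficient_apriori import apriori

    # each transaction: a linking publisher followed by the publishers it links to
    transactions = [
        ('cnn.com', 'nytimes.com', 'foxnews.com'),
        ('cnn.com', 'nytimes.com'),
        ('nytimes.com', 'cnn.com'),
    ]
    # min_support: fraction of transactions an itemset must appear in
    # min_confidence: P(rhs | lhs) required to report a rule
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=0.8)
    print(*rules, sep="\n")  # e.g. {cnn.com} -> {nytimes.com}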
src/bias.py (73 changed lines)
@@ -1,41 +1,16 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import pandas as pd
 from lxml import etree
-from pathlib import Path
-import os
 import csv
 
-def label_to_int(rating:str) -> int:
-
-    mapping = {
-        'left' : 0,
-        'left-center' : 1,
-        'center' : 2,
-        'right-center' : 3,
-        'right' : 4,
-        'allsides' : -1,
-    }
-
-    return mapping[rating]
-
-def int_to_label(class_id: int) -> str:
-    mapping = {
-        0 : 'left',
-        1 : 'left-center',
-        2 : 'center',
-        3 : 'right-center',
-        4 : 'right',
-        -1 : 'allsides',
-    }
-    return mapping[class_id]
-
 
 @click.command(name="bias:normalize")
 def normalize() -> None:
-    DB = connect()
-
-    DB.sql("""
+    with connect() as db:
+        db.sql("""
         CREATE OR REPLACE TABLE publisher_bias AS
         WITH cte AS (
             SELECT
@@ -72,9 +47,9 @@ def normalize() -> None:
     ]
     mapping = pd.DataFrame(mapping)
 
-    DB.query("alter table bias_ratings add column ordinal int")
-
-    DB.query("""
+    with connect() as db:
+        db.query("alter table bias_ratings add column ordinal int")
+        db.query("""
         update bias_ratings b
         set ordinal = o.ordinal
         FROM mapping o
@@ -85,9 +60,7 @@ def normalize() -> None:
 @click.command(name='bias:parse')
 def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    bias_html = DATA_DIR / 'allsides.html'
+    bias_html = paths('data') / 'allsides.html'
 
     parser = etree.HTMLParser()
     tree = etree.parse(str(bias_html), parser)
@@ -111,15 +84,14 @@ def parse() -> None:
         rating['disagree'] = int(disagree)
         ratings.append(rating)
     df = pd.DataFrame(ratings)
-    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
 
 @click.command(name="bias:load")
 def load() -> None:
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    f = str(DATA_DIR / "bias_ratings.csv")
+    f = str(paths('data') / "bias_ratings.csv")
 
-    DB.sql(f"""
+    with connect() as db:
+        db.sql(f"""
         CREATE TABLE bias_ratings as
         select
             row_number() over(order by b.publisher) as id
@@ -129,10 +101,8 @@ def load() -> None:
 
 @click.command('bias:export')
 def export():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
-
-    DB = connect()
-    all_bias = DB.query("""
+    with connect() as db:
+        all_bias = db.query("""
         SELECT
             id as bias_id
             ,publisher as name
@@ -140,8 +110,10 @@ def export():
         FROM bias_ratings
         ORDER by agree desc
     """)
-    all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
-    mapped_bias = DB.query("""
+
+    all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)
+    with connect() as db:
+        mapped_bias = db.query("""
         SELECT
             p.id as publisher_id
             ,p.name as name
@@ -152,18 +124,16 @@ def export():
         LEFT JOIN publisher_bias b
             ON b.publisher_id = p.id
     """)
-    mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
-    DB.close()
+    mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
 
 @click.command('bias:import-mapped')
 def import_mapped():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
     table_name = "top.publisher_bias"
 
-    DB = connect()
-    df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
+    df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")
 
-    DB.query(f"""
+    with connect() as db:
+        db.query(f"""
         CREATE OR REPLACE TABLE {table_name} AS
         SELECT
            publisher_id AS publisher_id
@@ -171,5 +141,6 @@ def import_mapped():
         FROM df
         WHERE bias_id IS NOT NULL
     """)
+
    print(f"created table: {table_name}")
src/cli.py (54 changed lines)
@@ -1,5 +1,7 @@
 import click
 from dotenv import load_dotenv
+import data
+import plots
 
 @click.group()
 def cli():
@@ -7,12 +9,20 @@ def cli():
 
 if __name__ == "__main__":
     load_dotenv()
-    from data import scrape
-    cli.add_command(scrape.download)
-    cli.add_command(scrape.parse)
-    cli.add_command(scrape.load)
-    cli.add_command(scrape.normalize)
-    cli.add_command(scrape.create_elections_table)
+
+    # original bias ratings
+    cli.add_command(data.scrape.download)
+    cli.add_command(data.scrape.parse)
+    cli.add_command(data.scrape.load)
+    cli.add_command(data.scrape.normalize)
+    cli.add_command(data.scrape.create_elections_table)
+
+    cli.add_command(data.factcheck.parse_index)
+    cli.add_command(data.factcheck.scrape)
+
+    cli.add_command(data.links.create_table)
+    cli.add_command(data.links.create_pca)
+    cli.add_command(data.links.create_clusters)
 
     import word
     # cli.add_command(word.distance)
@@ -23,10 +33,12 @@ if __name__ == "__main__":
     cli.add_command(bias.parse)
     cli.add_command(bias.load)
     cli.add_command(bias.normalize)
 
     import mine
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)
 
     import emotion
     cli.add_command(emotion.extract)
     cli.add_command(emotion.normalize)
@@ -40,34 +52,20 @@ if __name__ == "__main__":
     from train import main as train_main
     cli.add_command(train_main.main)
 
-    import plots.descriptive as plotd
-    cli.add_command(plotd.articles_per_year)
-    cli.add_command(plotd.distinct_publishers)
-    cli.add_command(plotd.stories_per_publisher)
-    cli.add_command(plotd.top_publishers)
-    cli.add_command(plotd.common_tld)
-
-    import links as linkcli
-    cli.add_command(linkcli.create_table)
-    cli.add_command(linkcli.create_pca)
-    cli.add_command(linkcli.create_clusters)
-
-    import plots.links as plotl
-    cli.add_command(plotl.elbow)
-    cli.add_command(plotl.link_pca_clusters)
-
-    import plots.classifier as plotc
-    cli.add_command(plotc.pca_with_classes)
-
-    import plots
+    cli.add_command(plots.descriptive.articles_per_year)
+    cli.add_command(plots.descriptive.distinct_publishers)
+    cli.add_command(plots.descriptive.stories_per_publisher)
+    cli.add_command(plots.descriptive.top_publishers)
+    cli.add_command(plots.descriptive.common_tld)
+    cli.add_command(plots.sentence.sentence_pca)
+    cli.add_command(plots.sentence.avg_sentence_pca)
+    cli.add_command(plots.emotion.emotion_over_time)
+    cli.add_command(plots.emotion.emotion_regression)
+
+    cli.add_command(plots.sentiment.over_time)
+    cli.add_command(plots.sentiment.bias_over_time)
+    cli.add_command(plots.sentiment.bias_vs_recent_winner)
+
+    cli.add_command(plots.links.elbow)
+    cli.add_command(plots.links.link_pca_clusters)
+    cli.add_command(plots.classifier.pca_with_classes)
 
     cli()
@@ -1,6 +1,10 @@
 import data.main
+import data.scrape
+import data.factcheck
+import data.links
 __all__ = [
     'main'
+    ,'scrape'
+    ,'factcheck'
+    ,'links'
 ]
@@ -0,0 +1,171 @@
+import requests
+from lxml import etree
+from bs4 import BeautifulSoup
+import re
+from io import BytesIO
+import pandas as pd
+from pathlib import Path
+import os
+import sys
+import click
+from data.main import connect, map_tld, paths, bias_label_to_int
+from random import randint
+from time import sleep
+from tqdm import tqdm
+
+
+@click.command('mbfc:parse-index')
+def parse_index():
+    parser = etree.HTMLParser()
+    publishers = []
+    for page in range(1, 54):
+        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
+        print(f"downloading {url}", file=sys.stderr)
+        response = requests.get(url)
+        html = response.content
+        tree = etree.parse(BytesIO(html), parser)
+        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
+        print(f"parsing {len(rows)} rows", file=sys.stderr)
+        for row in rows:
+            publisher = {}
+            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
+            link = link.xpath('./a')[0]
+            publisher['name'] = link.text
+            publisher['detail_url'] = link.get('href')
+            publisher['bias'] = bias.text
+            publisher['reporting'] = reporting.text
+            publisher['country'] = country.text
+            publisher['credibility'] = credibility.text
+            publisher['media_type'] = media_type.text
+            publisher['traffic'] = traffic.text
+            publisher['popularity'] = popularity.xpath('./span')[0].text
+            publishers.append(publisher)
+    df = pd.DataFrame(publishers)
+    save_to = paths('data') / 'mbfc_bias.csv'
+    df.to_csv(save_to, sep='|', index=False)
+    print(f"saved {len(df)}: {save_to}", file=sys.stderr)
+
+@click.command("mbfc:schema")
+def schema():
+    with connect() as db:
+        db.sql("""create schema mbfc""")
+        db.sql("""create or replace table mbfc.scrape (
+            url text
+            ,scraped_at datetime default now()
+        )
+        """)
+
+@click.command("mbfc:scrape")
+def scrape():
+    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")
+
+    with connect() as db:
+        stats = db.query("""
+            select
+                count(1) filter(where s.url is not null) as elapsed
+                ,count(1) filter(where s.url is null) as remaining
+            from df
+            left join mbfc.scrape s
+                on df.detail_url = s.url
+        """).fetchall()
+        df = db.query("""
+            select
+                detail_url as url
+            from df
+            where df.detail_url not in (
+                select
+                    url
+                from mbfc.scrape
+            )
+        """).df()
+    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")
+
+    for url in df.url:
+        delay = randint(1,3)
+        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
+        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
+        sleep(delay)
+        try:
+            response = requests.get(url)
+        except Exception as e:
+            print(f"request failed: {url}", file=sys.stderr)
+            continue
+        with open(save_as, 'w') as f:
+            f.write(response.text)
+        with connect() as db:
+            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
+        print(f"saved: {save_as}", file=sys.stderr)
+
+def load():
+    publishers = []
+    for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
+        publisher = {}
+        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
+        with page.open() as p:
+            tree = BeautifulSoup(p, 'html.parser')
+        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
+            e = e.parent
+            while e.name != 'p':
+                e = e.parent
+            l = e.find('a')
+            if l:
+                publisher['tld'] = l.get('href')
+                break
+        else:
+            breakpoint()
+        publishers.append(publisher)
+    df = pd.DataFrame(publishers)
+    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")
+
+@click.command('mbfc:create-tables')
+def create_tables():
+    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
+    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
+    df = pubs.merge(urls, on='mbfc_url')
+    df['tld'] = df.tld.apply(map_tld)
+    df['ordinal'] = df.bias.apply(bias_label_to_int)
+
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE mbfc.publishers AS
+            SELECT
+                row_number() over() as id
+                ,p.tld
+                ,mode(p.name) as name
+                ,mode(p.bias) as bias
+                ,mode(p.ordinal) as ordinal
+                ,mode(p.reporting) as reporting
+                ,mode(p.country) as country
+                ,mode(p.credibility) as credibility
+                ,mode(p.media_type) as media_type
+                ,mode(p.traffic) as traffic
+                ,mode(p.popularity) as popularity
+            FROM df p
+            GROUP BY
+                p.tld
+        """)
+
+    with connect() as db:
+        raw_stories = db.sql("""
+            SELECT
+                *
+            FROM stories s
+        """).df()
+
+    raw_stories['tld'] = raw_stories.url.apply(map_tld)
+
+    with connect() as db:
+        db.sql("""
+            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
+            SELECT
+                s.id as story_id
+                ,p.id as publisher_id
+            FROM raw_stories s
+            JOIN mbfc.publishers p
+                ON p.tld = s.tld
+        """)
@@ -0,0 +1,135 @@
+import click
+from data.main import connect
+import pandas as pd
+
+@click.command('links:create-table')
+def create_table():
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE link_edges AS
+            with cte as(
+                SELECT
+                    s.publisher_id as parent_id
+                    ,r.publisher_id as child_id
+                    ,count(1) as links
+                FROM stories s
+                JOIN related_stories r
+                    ON s.id = r.parent_id
+                group by
+                    s.publisher_id
+                    ,r.publisher_id
+            )
+            SELECT
+                cte.parent_id
+                ,cte.child_id
+                ,cte.links as links
+                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
+                ,case when cte.links > 0 then 1 else 0 end as onehot
+            FROM cte
+            WHERE cte.child_id in (
+                SELECT
+                    distinct parent_id
+                FROM cte
+            )
+            AND cte.parent_id in (
+                SELECT
+                    distinct child_id
+                FROM cte
+            )
+        """)
+
+        db.query("""
+            SELECT
+                *
+                ,count(1) over()
+            FROM link_edges e
+            limit 1
+        """)
+
+    print("created link_edges")
+
+@click.command('links:create-pca')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_pca(source):
+    """create 2D pca labels"""
+    from sklearn.decomposition import PCA
+
+    table_name = f"publisher_pca_{source}"
+
+    with connect() as db:
+        pub = db.query("""
+            SELECT
+                p.*
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON p.id = ps.publisher_id
+        """).df()
+        df = db.query(f"""
+            SELECT
+                parent_id
+                ,child_id
+                ,{source} as links
+            FROM link_edges
+        """).df()
+
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    svd = PCA(n_components=2)
+    svd_out = svd.fit_transform(pivot)
+    out = pivot.reset_index()[['parent_id']]
+    out['first'] = svd_out[:, 0]
+    out['second'] = svd_out[:, 1]
+    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                out.id as publisher_id
+                ,out.first as first
+                ,out.second as second
+            FROM out
+        """)
+
+    print(f"created {table_name}")
+
+
+@click.command('links:create-clusters')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_clusters(source):
+    """create link adj. matrix clusters table"""
+    from sklearn.cluster import KMeans
+
+    table_name = f"publisher_clusters_{source}"
+    with connect() as db:
+        df = db.query(f"""
+            SELECT
+                parent_id
+                ,child_id
+                ,{source} as links
+            FROM link_edges
+        """).df()
+        pub = db.query("""
+            SELECT
+                p.*
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+        """).df()
+    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    k = 8
+    kmeans = KMeans(n_clusters=k, n_init="auto")
+    pred = kmeans.fit_predict(pivot)
+    out = pivot.reset_index()[['parent_id']]
+    out['label'] = pred
+    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+    new_table = out[['id', 'label']]
+    with connect() as db:
+        db.query(f"""
+            CREATE OR REPLACE TABLE {table_name} AS
+            SELECT
+                n.id as publisher_id
+                ,n.label as label
+            FROM new_table n
+        """)
+    print(f"created {table_name}")
@@ -2,6 +2,10 @@ import os
 from pathlib import Path
 import duckdb
 from enum import Enum
+from urllib.parse import urlparse
+from tld import get_tld
+from tld.utils import update_tld_names
+import sys
 
 class Data(str, Enum):
     Titles = 'titles'
@@ -9,6 +13,16 @@ class Data(str, Enum):
 def data_dir():
     return Path(os.environ['DATA_MINING_DATA_DIR'])
 
+def paths(name='app'):
+    if 'app' in name:
+        return Path(os.environ['DATA_MINING_APP_DIR'])
+    if 'data' in name:
+        return Path(os.environ['DATA_MINING_DATA_DIR'])
+    if 'doc' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR'])
+    if 'figure' in name:
+        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
+
 def connect():
     DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
     # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
@@ -28,3 +42,66 @@ def from_db(t: Data):
         limit 100
     """).df()
     return table
+
+def map_tld(x):
+    try:
+        res = get_tld(x, as_object=True)
+        return res.fld
+    except:
+        print(f"'{x}' is not valid.", file=sys.stderr)
+        return None
+
+def ticklabels():
+    return [
+        'Left',
+        'Left-Center',
+        'Least Biased',
+        'Right-Center',
+        'Right',
+    ]
+
+def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
+    if source == 'mbfc':
+        mapping = {
+            'Left' : 0,
+            'Left-Center' : 1,
+            'Least Biased' : 2,
+            'Right-Center' : 3,
+            'Right' : 4,
+        }
+    else:
+        mapping = {
+            'left' : 0,
+            'left-center' : 1,
+            'center' : 2,
+            'right-center' : 3,
+            'right' : 4,
+        }
+    try:
+        return mapping[rating]
+    except:
+        print(f"no mapping for {rating}", file=sys.stderr)
+        return -1
+
+def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
+    if source == 'mbfc':
+        mapping = {
+            0 : 'Left',
+            1 : 'Left-Center',
+            2 : 'Least Biased',
+            3 : 'Right-Center',
+            4 : 'Right',
+        }
+    else:
+        mapping = {
+            0 : 'left',
+            1 : 'left-center',
+            2 : 'center',
+            3 : 'right-center',
+            4 : 'right',
+        }
+    try:
+        return mapping[class_id]
+    except:
+        print(f"no mapping for {class_id}", file=sys.stderr)
+        return -1
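The new `paths()` helper above replaces the `os.environ` lookups that were previously repeated in each module; note the matching is substring-based, so 'figure' also matches 'figures'. A usage sketch, assuming the same environment variables the rest of the codebase already requires:

    from data.main import paths

    bias_csv = paths('data') / 'bias_ratings.csv'    # $DATA_MINING_DATA_DIR/bias_ratings.csv
    figure_out = paths('figures') / 'bias_hist.png'  # $DATA_MINING_DOCS_DIR/figures/bias_hist.png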
@@ -319,12 +319,6 @@ def another_norm():
     """)
 
 
-    def map_tld(x):
-        try:
-            res = get_tld(x, as_object=True)
-            return res.fld
-        except:
-            return None
-
     DB.sql("""
         SELECT
@@ -0,0 +1,47 @@
+from data.main import connect
+import pandas as pd
+import numpy as np
+
+def create_tables():
+    with connect() as db:
+        edges = db.query("""
+            select
+                *
+            from link_edges
+        """).df()
+
+    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
+
+    with connect() as db:
+        db.query("create schema top")
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.publishers AS
+            SELECT
+                p.*
+            FROM publishers p
+            JOIN select_publishers s
+                ON s.publisher_id = p.id
+        """)
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.stories AS
+            SELECT
+                s.*
+            FROM stories s
+            JOIN top.publishers p
+                ON s.publisher_id = p.id
+            WHERE year(s.published_at) >= 2006
+            AND year(s.published_at) < 2023
+        """)
+
+        db.query("""
+            CREATE OR REPLACE TABLE top.related_stories AS
+            SELECT
+                r.*
+            FROM top.stories s
+            JOIN related_stories r
+                ON s.id = r.parent_id
+        """)
@@ -1,10 +1,11 @@
 import click
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
 import torch.nn.functional as F
-from data import connect, data_dir
+from data.main import connect, paths
 import numpy as np
 from tqdm import tqdm
 import click
 import pandas as pd
 
 @click.option('-c', '--chunks', type=int, default=500, show_default=True)
 @click.command("sentiment:extract")
@@ -67,20 +68,19 @@ def extract(chunks):
 @click.command('sentiment:load')
 def load():
 
-    DB = connect()
-    sentiments = np.load(data_dir() / 'sentiment.npy')
-    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
+    sentiments = np.load(paths('data') / 'sentiment.npy')
+    story_ids = np.load(paths('data') / 'sentiment_ids.npy')
     data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
     data['sentiment_id'] = sentiments
 
-    DB.query("""
-        CREATE OR REPLACE TABLE top.story_sentiments AS
+    with connect() as db:
+        db.query("""
+        CREATE OR REPLACE TABLE story_sentiments AS
         SELECT
             data.story_id
             ,data.sentiment_id as class_id
             ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
         FROM data
-        JOIN top.stories s
+        JOIN stories s
             ON s.id = data.story_id
     """)
-    DB.close()
src/links.py (255 lines, file deleted)
@@ -1,255 +0,0 @@
-import click
-from data.main import connect
-import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-
-@click.command('links:create-table')
-def create_table():
-    table_name = "top.link_edges"
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        with cte as(
-            SELECT
-                s.publisher_id as parent_id
-                ,r.publisher_id as child_id
-                ,count(1) as links
-            FROM top.stories s
-            JOIN top.related_stories r
-                ON s.id = r.parent_id
-            group by
-                s.publisher_id
-                ,r.publisher_id
-        )
-        SELECT
-            cte.parent_id
-            ,cte.child_id
-            ,cte.links as links
-            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
-            ,case when cte.links > 0 then 1 else 0 end as onehot
-        FROM cte
-        WHERE cte.child_id in (
-            SELECT
-                distinct parent_id
-            FROM cte
-        )
-        AND cte.parent_id in (
-            SELECT
-                distinct child_id
-            FROM cte
-        )
-    """)
-    DB.close()
-
-    DB = connect()
-    DB.query("""
-        SELECT
-            *
-            ,-log10(links)
-            --distinct parent_id
-        FROM top.link_edges e
-        WHERE e.parent_id = 238
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-@click.command('links:create-pca')
-@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
-def create_pca(source):
-    """create 2D pca labels"""
-
-    from sklearn.decomposition import PCA
-
-    table_name = f"top.publisher_pca_{source}"
-    DB = connect()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    df = DB.query(f"""
-        SELECT
-            parent_id
-            ,child_id
-            ,{source} as links
-        FROM top.link_edges
-    """).df()
-    DB.close()
-    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-
-    svd = PCA(n_components=2)
-    svd_out = svd.fit_transform(pivot)
-
-    out = pivot.reset_index()[['parent_id']]
-    out['first'] = svd_out[:, 0]
-    out['second'] = svd_out[:, 1]
-    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
-
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            out.id as publisher_id
-            ,out.first as first
-            ,out.second as second
-        FROM out
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-
-@click.command('links:create-clusters')
-@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
-def create_clusters(source):
-    from sklearn.cluster import KMeans
-
-    table_name = f"top.publisher_clusters_{source}"
-    DB = connect()
-    df = DB.query(f"""
-        SELECT
-            parent_id
-            ,child_id
-            ,{source} as links
-        FROM top.link_edges
-    """).df()
-    pub = DB.query("""
-        SELECT
-            *
-        FROM top.publishers
-    """).df()
-    DB.close()
-    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-
-
-    k = 8
-    kmeans = KMeans(n_clusters=k, n_init="auto")
-    pred = kmeans.fit_predict(pivot)
-    out = pivot.reset_index()[['parent_id']]
-    out['label'] = pred
-    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
-    new_table = out[['id', 'label']]
-
-    DB = connect()
-    DB.query(f"""
-        CREATE OR REPLACE TABLE {table_name} AS
-        SELECT
-            n.id as publisher_id
-            ,n.label as label
-        FROM new_table n
-    """)
-    DB.close()
-    print(f"created {table_name}")
-
-def to_matrix():
-    """returns an adjacency matrix of publishers to publisher link frequency"""
-
-    DB = connect()
-
-    bias_map = pd.DataFrame([
-        {'label' :'left', 'value' : 0},
-        {'label' :'left-center', 'value' : 1},
-        {'label' :'center', 'value' : 2},
-        {'label' :'right-center', 'value' : 3},
-        {'label' :'right', 'value' : 4},
-        {'label' :'allsides', 'value' : -1},
-    ])
-
-    bias = DB.sql("""
-        SELECT
-            b.id
-            ,b.label
-            ,m.value
-        FROM publisher_bias b
-        JOIN bias_map m
-            ON b.label = m.label
-        WHERE value != -1
-    """).df()
-
-    pub = DB.sql("""
-        select
-            p.id
-            ,p.name
-            ,p.url
-        from publishers p
-    """).df()
-
-    edges = DB.sql("""
-        WITH total as (
-            SELECT
-                s.publisher_id as id
-                ,COUNT(1) as stories
-            FROM stories s
-            GROUP BY
-                s.publisher_id
-        ), p as (
-            SELECT
-                p.id
-                ,stories
-            FROM publishers p
-            LEFT JOIN total t
-                ON t.id = p.id
-            WHERE t.stories >= 20
-        ), cte as (
-            SELECT
-                r.publisher_id as child_id
-                ,s.publisher_id as parent_id
-                ,count(1) as links
-            FROM related_stories r
-            JOIN stories s
-                ON s.id = r.parent_id
-            group by
-                s.publisher_id
-                ,r.publisher_id
-        )
-        SELECT
-            p.id as parent_id
-            ,cte.child_id
-            ,links
-        FROM p
-        left JOIN cte
-            ON p.id = cte.parent_id
-    """).df()
-
-    # only keep values that have more than 1 link
-    test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
-    edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-    pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
-    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
-    adj.values.shape
-
-
-    out = pd.DataFrame(adj.index.values, columns=['id'])
-    out = pd.merge(out, pub, how='left', on='id')
-    return out
-
-@click.command('links:analysis')
-def analysis():
-    from sklearn.decomposition import PCA, TruncatedSVD
-    from sklearn.cluster import MiniBatchKMeans
-    adj = to_matrix()
-    pca = PCA(n_components=4)
-    pca_out = pca.fit_transform(adj)
-
-    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
-    svd_out = svd.fit_transform(adj)
-
-    x = svd_out[:, 0]
-    y = svd_out[:, 1]
-
-    x = pca_out[:, 0]
-    y = pca_out[:, 1]
-    sns.scatterplot(x=x, y=y)
-    plt.show()
-
-    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
-    pred = kmeans.fit_predict(pca_out)
-
-    sns.scatterplot(x=x, y=y, hue=pred)
-    plt.show()
-
-    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
-    plt.show()
@@ -1,6 +1,5 @@
-from data.main import data_dir, connect
+from data.main import connect, paths
 import numpy as np
 import sklearn
 from sklearn.cluster import MiniBatchKMeans
 import click
-from pathlib import Path
@@ -11,7 +10,7 @@ from enum import Enum, auto
 
 @click.command(name="mine:embeddings")
 def embeddings():
-    data = np.load(data_dir() / "embeddings.npy")
+    data = np.load(paths('data') / "embeddings.npy")
     kmeans = MiniBatchKMeans(n_clusters=5,
                              random_state=0,
                              batch_size=6,
@@ -76,7 +75,7 @@ class PlotName(str, Enum):
 @click.option('-n', '--name', required=True, type=click.Choice(PlotName))
 @click.option('-o', '--output', required=False, type=click.Path())
 def plot(name: PlotName, output: Path):
-    output = output if output else APP_DIR / f'docs/{name}.png'
+    output = output if output else paths('figures') / f'{name}.png'
     if name == PlotName.TitleLength:
         fig, ax = plt.subplots(1,1)
         data = db.sql("""
@@ -0,0 +1,36 @@
+from data.main import connect, map_tld
+import os
+from pathlib import Path
+
+def normalize():
+    with connect() as db:
+        db.sql("""
+            SELECT
+                p.name
+                ,count(1) as ctn
+                ,sum(ctn) over() as all
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.name
+        """)
+
+    with connect() as db:
+        db.sql("""
+            SELECT
+                bias
+                ,count(distinct p.id) as publishers
+                ,count(1) as stories
+                ,count(1) / count(distinct p.id) as ratio
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
+                p.bias
+            ORDER BY count(1)
+        """)
@@ -1,9 +1,13 @@
 import plots.sentence
+import plots.emotion
+import plots.sentiment
+import plots.links
+import plots.classifier
 
 __all__ = [
     'sentence',
     'emotion',
     'sentiment',
     'links',
     'classifier',
 ]
@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, bias_label_to_int, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
@@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:bias-hist')
 def hist():
-    filename = "bias_hist.png"
+    save_to = paths('figures') / "bias_hist.png"
 
-    DB = connect()
-    data = DB.sql("""
+    with connect() as db:
+        data = db.sql("""
         SELECT
-            b.ordinal
+            p.ordinal
             ,count(1) as stories
         FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON s.id = ps.story_id
+        JOIN mbfc.publishers p
+            ON ps.publisher_id = p.id
+        WHERE ordinal != -1
         GROUP BY
-            b.ordinal
+            p.ordinal
     """).df()
-    DB.close()
 
     ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 @click.command('plot:bias-publisher-hist')
 def publisher_hist():
-    filename = "bias_publisher_hist.png"
+    save_to = paths('figures') / "bias_publisher_hist.png"
 
-    DB = connect()
-    data = DB.sql("""
+    with connect() as db:
+        data = db.sql("""
         SELECT
-            b.ordinal
-            ,count(1) as publishers
-        FROM publisher_bias pb
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+            p.ordinal
+            ,count(distinct p.id) as publishers
+        FROM mbfc.publishers p
+        JOIN mbfc.publisher_stories ps
+            ON ps.publisher_id = p.id
         WHERE ordinal != -1
         GROUP BY
-            b.ordinal
+            p.ordinal
     """).df()
-    DB.close()
 
     ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
+    plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
@@ -5,30 +5,32 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path
 
-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:pca-with-classes')
-def pca_with_classes():
-    filename = "pca_with_classes.png"
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def pca_with_classes(source):
 
-    DB = connect()
-    data = DB.query(f"""
+    save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
+
+    with connect() as db:
+        df = db.query(f"""
         SELECT
             p.tld
-            ,b.bias
+            ,p.bias
             ,c.first
             ,c.second
-            ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
-        FROM top.publishers p
-        JOIN top.publisher_bias pb
-            ON p.id = pb.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        JOIN top.publisher_pca_normalized c
+            --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+        FROM mbfc.publishers p
+        JOIN publisher_pca_{source} c
             ON c.publisher_id = p.id
+        WHERE p.ordinal != -1
+        ORDER BY p.ordinal
     """).df()
-    DB.close()
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
-    ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
-    print(f"saved: {filename}")
+
+    ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
+    ax.set(xlabel="first pca component",
+           ylabel="second pca component")
+    ax.figure.suptitle="pca components vs. bias labels"
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')
@@ -1,18 +1,16 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import os
 import seaborn as sns
 import matplotlib.pyplot as plt
-from pathlib import Path
 import numpy as np
 
-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:articles-per-year')
 def articles_per_year():
-    filename = 'articles_per_year.png'
+    save_to = paths('figures') / 'articles_per_year.png'
 
-    DB = connect()
+    with connect() as db:
     data = DB.query("""
         select
             year(published_at) as year
@@ -21,19 +19,19 @@ def articles_per_year():
     group by
         year(published_at)
     """).df()
-    DB.close()
 
     ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of articles per year", ylabel="count of stories (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    print(f"saved: {save_to}")
 
 @click.command('plot:distinct-publishers')
 def distinct_publishers():
-    filename = 'distinct_publishers.png'
+    save_to = paths('figures') / 'distinct_publishers.png'
 
-    DB = connect()
+    with connect() as db:
     data = DB.query("""
         select
             year(published_at) as year
@@ -42,30 +40,32 @@ def distinct_publishers():
     group by
         year(published_at)
     """).df()
-    DB.close()
 
     ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
     ax.tick_params(axis='x', rotation=90)
     ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 @click.command('plot:stories-per-publisher')
 def stories_per_publisher():
-    filename = 'stories_per_publisher.png'
+    save_to = paths('figures') / 'stories_per_publisher.png'
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
         with cte as (
             select
-                publisher_id
-                ,year(published_at) as year
+                ps.publisher_id
+                ,year(s.published_at) as year
                 ,count(1) as stories
-            from stories
+            from stories s
+            join mbfc.publisher_stories ps
+                on ps.story_id = s.id
            group by
-                publisher_id
-                ,year(published_at)
+                ps.publisher_id
+                ,year(s.published_at)
        ) , agg as (
            select
                publisher_id
@@ -91,64 +91,86 @@ def stories_per_publisher():
     group by
         max_avg
     """).df()
-    DB.close()
 
     ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
-    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
+    ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 
 @click.command('plot:top-publishers')
 def top_publishers():
     """plot top publishers over time"""
 
-    filename = 'top_publishers.png'
+    save_to = paths('figures') / 'top_publishers.png'
 
-    DB = connect()
-    data = DB.query("""
-        select
-            p.tld
-            ,year(published_at) as year
-            ,count(1) as stories
-        from (
-            select
+    with connect() as db:
+        db.query("""
+            SELECT
+                p.tld
+                ,p.id
-            from top.publishers p
-            join top.stories s
-                on s.publisher_id = p.id
-            group by
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            JOIN stories s
+                ON s.id = ps.story_id
+            GROUP BY
                 p.tld
                 ,p.id
             order by count(1) desc
             limit 20
-        ) p
-        join top.stories s
-            on s.publisher_id = p.id
-        group by
+        """)
+
+    with connect() as db:
+        data = db.query("""
+            WITH p as (
+                SELECT
+                    p.tld
-            ,year(published_at)
-        order by count(distinct s.id) desc
+                    ,p.id
+                FROM mbfc.publishers p
+                JOIN mbfc.publisher_stories ps
+                    ON ps.publisher_id = p.id
+                JOIN stories s
+                    ON s.id = ps.story_id
+                GROUP BY
+                    p.tld
+                    ,p.id
+                order by count(1) desc
+                limit 20
+            )
+            SELECT
+                p.tld
+                ,YEAR(s.published_at) AS year
+                ,COUNT(1) AS stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN p
+                ON p.id = ps.publisher_id
+            GROUP by
+                p.tld
+                ,YEAR(published_at)
+            ORDER BY year, COUNT(DISTINCT s.id) DESC
        """).df()
-    DB.close()
 
     pivot = data.pivot(columns='year', index='tld', values='stories')
     ax = sns.heatmap(pivot, cmap="crest")
     ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
     plt.tight_layout()
-    plt.savefig(out_dir / filename)
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 
 @click.command('plot:common_tld')
 def common_tld():
     import dataframe_image as dfi
-    filename = 'common_tld.png'
+    save_to = paths('figures') / 'common_tld.png'
 
-    DB = connect()
-    data = DB.query("""
+    with connect() as db:
+        data = db.query("""
         select
             split_part(url, '.', -1) as tld
             ,count(1) as publishers
@@ -162,8 +184,7 @@ def common_tld():
     order by
         count(1) desc
     """).df()
-    DB.close()
-    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
+    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')
 
 def stats():
 
@@ -246,7 +267,7 @@ def stats():
 @click.command('plot:bias-stats')
 def bias_stats():
     import dataframe_image as dfi
-    filename = 'bias_stats.png'
+    save_to = paths('figures') / 'bias_stats.png'
 
     DB = connect()
 
@@ -300,3 +321,69 @@ def bias_stats():
     """).df()
     DB.close()
     print(df.to_markdown(index=False))
+
+@click.command('plot:bias-over-time')
+def bias_over_time():
+    """plot bias labels over time"""
+
+    save_to = paths('figures') / 'bias_over_time.png'
+
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                p.bias
+                ,p.id
+                ,date_trunc('year', s.published_at) as year
+                ,count(1) as stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            JOIN mbfc.publishers p
+                ON p.id = ps.publisher_id
+            where year(s.published_at) not in (2006, 2023)
+            and p.ordinal != -1
+            GROUP BY
+                p.bias
+                ,p.id
+                ,p.ordinal
+                ,date_trunc('year', s.published_at)
+            order by
+                p.ordinal
+                ,date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+
+def bias_missing():
+
+    with connect() as db:
+        df = db.sql("""
+            SELECT
+                date_trunc('year', s.published_at) as year
+                ,s.tld
+                ,count(1) as stories
+            FROM stories s
+            LEFT JOIN mbfc.publisher_stories ps
+                ON ps.story_id = s.id
+            WHERE ps.publisher_id is NULL
+            AND year(s.published_at) not in (2006, 2023)
+            GROUP BY
+                s.tld
+                ,date_trunc('year', s.published_at)
+            HAVING count(1) > 10
+            ORDER BY
+                date_trunc('year', s.published_at)
+        """).df()
+
+    ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
+    ax.set(ylabel="stories", xlabel="year")
+    plt.tight_layout()
+    plt.show()
+    #plt.savefig(save_to)
+    plt.close()
+    #print(f"saved: {save_to}")
@@ -1,25 +1,24 @@
 import click
-from data.main import connect
+from data.main import connect, paths, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
-
 @click.command('plot:emotion-over-time')
 def emotion_over_time():
-    filename = "emotion_over_time.png"
-    DB = connect()
-
-    emotions = DB.sql("""
+    filename = "emotion_over_time.png"
+    save_to = paths('figures') / filename
+
+    with connect() as db:
+        emotions = db.sql("""
         SELECT
             date_trunc('year', s.published_at) AS year
             ,e.label AS emotion
             ,count(1) AS stories
-        FROM top.stories s
+        FROM stories s
         JOIN story_emotions se
             ON s.id = se.story_id
         JOIN emotions e
@@ -28,50 +27,53 @@ def emotion_over_time():
         date_trunc('year', s.published_at)
         ,e.label
     """).df()
-    DB.close()
 
     ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
     ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')
 
 @click.command('plot:emotion-regression')
 def emotion_regression():
     """plot emotion over time as regression"""
 
     from sklearn import linear_model
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import ConfusionMatrixDisplay
 
     filename = "emotion_regression.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-    emotions = DB.query("""
-        SELECT
-            label
-        FROM emotions e
-    """).df()['label'].to_list()
-    DB.close()
-
-    DB = connect()
-    df = DB.sql(f"""
+    with connect() as db:
+        #emotions = db.query("""
+        #    SELECT
+        #        label
+        #    FROM emotions e
+        #""").df()['label'].to_list()
+        df = db.sql(f"""
         SELECT
             epoch(date_trunc('yearweek', s.published_at)) AS date
             ,e.id AS emotion_id
             ,p.id as publisher_id
             ,count(1) AS stories
-        FROM top.stories s
-        JOIN top.publishers p
-            ON p.id = s.publisher_id
+        FROM stories s
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
         JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
+        WHERE p.ordinal != -1
        GROUP by
            epoch(date_trunc('yearweek', s.published_at))
            ,p.id
            ,e.id
    """).df()
-    DB.close()
 
    results = []
    for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
@@ -83,77 +85,59 @@ def emotion_regression():
        results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
    results = pd.DataFrame(results)
 
-    DB = connect()
-    out = DB.query("""
+    with connect() as db:
+        out = db.query("""
        SELECT
            e.label as emotion
            --,p.tld
            ,avg(results.per_year) as avg_reg_coef
-            ,b.ordinal
+            ,p.bias
        FROM results
        JOIN emotions e
            ON e.id = results.emotion_id
-        JOIN top.publishers p
+        JOIN mbfc.publishers p
            ON p.id = results.publisher_id
-        JOIN publisher_bias pb
-            ON pb.publisher_id = results.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
        GROUP BY
            e.label
-            ,b.ordinal
+            ,p.bias
    """).df()
-    DB.close()
-    pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
-
-    ax = sns.heatmap(pivot, cmap='RdBu_r')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
+    pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])
+
+    ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
+    #ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
    ax.set(title="slope of regression (stories/year) by bias and emotion"
-        ,xticklabels=ticklabels
+        ,xticklabels=ticklabels()
        ,xlabel="bias"
        ,ylabel="emotion")
    plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
 
 @click.command('plot:emotion-hist')
 def emotion_hist():
 
     filename = "emotion_hist.png"
+    save_to = paths('figures') / filename
 
-    DB = connect()
-    DB.query("""describe story_emotions""")
-
-    DB.query("""
-        select
-            e.label
-            ,count(distinct s.id) as stories
-            ,count(distinct s.publisher_id) as publishers
-        from story_emotions se
-        join emotions e
-            on e.id = se.emotion_id
-        join top.stories s
-            on s.id = se.story_id
-        group by
-            e.label
-    """).df().to_markdown(index=False)
-
-    data = DB.sql("""
+    with connect() as db:
+        data = db.sql("""
        SELECT
-            b.ordinal
+            p.bias
            ,count(1) as stories
        FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
+        JOIN mbfc.publisher_stories ps
+            ON ps.story_id = s.id
+        JOIN mbfc.publishers p
+            ON p.id = ps.publisher_id
+        WHERE p.ordinal != -1
        GROUP BY
-            b.ordinal
+            p.bias
    """).df()
-    DB.close()
 
-    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
    plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
@ -9,20 +9,20 @@ import numpy as np
|
|||
from sklearn.metrics import silhouette_score
|
||||
import pandas as pd
|
||||
|
||||
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
|
||||
@click.command('plot:link-elbow')
|
||||
def elbow():
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
filename = 'link_cluster_elbow.png'
|
||||
save_to = paths('figures') / 'link_cluster_elbow.png'
|
||||
|
||||
DB = connect()
|
||||
df = DB.query("""
|
||||
with connect() as db:
|
||||
df = db.query("""
|
||||
SELECT
|
||||
*
|
||||
FROM link_edges
|
||||
""").df()
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
|
||||
to_plot = []
|
||||
|
@ -36,8 +36,9 @@ def elbow():
|
|||
|
||||
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
|
||||
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
|
||||
plt.savefig(out_dir / filename)
|
||||
plt.savefig(save_to)
|
||||
plt.close()
|
||||
print(f"saved plot: {save_to}")
|
||||
|
||||
# randomly pick 8
|
||||
|
||||
|
@ -45,10 +46,10 @@ def elbow():
|
|||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def link_pca_clusters(source):
|
||||
|
||||
filename = f"link_pca_clusters_{source}.png"
|
||||
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
|
||||
|
||||
DB = connect()
|
||||
df = DB.query(f"""
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
c.label as cluster
|
||||
,p.tld
|
||||
|
@ -72,21 +73,17 @@ def link_pca_clusters(source):
|
|||
JOIN top.publisher_pca_{source} pca
|
||||
ON pca.publisher_id = p.id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
|
||||
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
|
||||
plt.savefig(out_dir / filename)
|
||||
|
||||
# .df().groupby(['cluster', 'bias']).describe()
|
||||
|
||||
|
||||
plt.savefig(save_to)
|
||||
print(f"saved plot: {save_to}")
|
||||
|
||||
|
||||
def test():
|
||||
data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
|
||||
|
||||
DB.query("""
|
||||
with connect() as db:
|
||||
db.query("""
|
||||
SELECT
|
||||
p.id as publisher_id
|
||||
,p.name
|
||||
|
@ -109,9 +106,6 @@ def test():
|
|||
ORDER BY count(1) desc
|
||||
""")
|
||||
|
||||
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
|
||||
DB.close()
|
||||
|
||||
|
||||
@click.command('plot:link-confusion')
def link_confusion():

@ -120,9 +114,10 @@ def link_confusion():
from sklearn.metrics import ConfusionMatrixDisplay

filename = "link_confusion.png"
save_to = paths('figures') / filename

DB = connect()
bias = DB.query("""
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal

@ -133,7 +128,7 @@ def link_confusion():
ON b.id = pb.bias_id
""").df()

df = DB.query("""
df = db.query("""
SELECT
*
FROM top.link_edges

@ -148,6 +143,7 @@ def link_confusion():
from bias
)
""").df()

pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

x = pivot.values

@ -166,9 +162,9 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

@click.command('plot:link-classifier')
def link_confusion():

@ -176,10 +172,10 @@ def link_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay

filename = "link_confusion.png"
save_to = paths('figures') / "link_confusion.png"

DB = connect()
bias = DB.query("""
with connect() as db:
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal

@ -190,7 +186,7 @@ def link_confusion():
ON b.id = pb.bias_id
""").df()

df = DB.query("""
df = db.query("""
SELECT
*
FROM top.link_edges

@ -205,12 +201,14 @@ def link_confusion():
from bias
)
""").df()

pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

x = pivot.values
y = bias.sort_values('publisher_id').ordinal

data = DB.query(f"""
with connect() as db:
data = db.query(f"""
SELECT
p.id as publisher_id
,pca.first

@ -235,11 +233,11 @@ def link_confusion():
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_dir / filename)
plt.savefig(save_to)
plt.close()
print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
plt.savefig(out_dir / filename)
plt.close()
print(f"saved plot: {filename}")
# ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
# plt.savefig(out_dir / filename)
# plt.close()
# print(f"saved plot: {filename}")


@ -1,5 +1,5 @@
import click
from data.main import connect
from data.main import connect, paths
import os
from pathlib import Path
import seaborn as sns

@ -7,15 +7,12 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))

@click.command('plot:sentence-pca')
def sentence_pca():
filename = "embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "embedding_sentence_pca.png"

data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second

@ -28,18 +25,17 @@ def sentence_pca():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()

ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)

@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
filename = "avg_embedding_sentence_pca.png"
DB = connect()
save_to = paths('figures') / "avg_embedding_sentence_pca.png"

data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
pca.first
,pca.second

@ -53,11 +49,10 @@ def avg_sentence_pca():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()

ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(out_path / filename)
plt.savefig(save_to)

@click.command('plot:sentence-confusion')
def sentence_confusion():

@ -65,14 +60,14 @@ def sentence_confusion():
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay

filename = "sentence_confusion.png"
save_to = paths('figures') / "sentence_confusion.png"

embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id

@ -85,12 +80,11 @@ def sentence_confusion():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()

train, test = train_test_split(data)
train_x, train_y = embeddings[train['index']], train['ordinal']

@ -105,7 +99,7 @@ def sentence_confusion():
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()

print(f"saved plot: {filename}")
print(f"saved plot: {save_to}")

@ -1,20 +1,16 @@
import click
from data.main import connect
import os
from pathlib import Path
from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

@click.command('plot:sentiment-over-time')
def over_time():
filename = "sentiment_over_time.png"

DB = connect()
data = DB.sql("""
filename = "sentiment_over_time.png"
save_to = paths('figures') / filename

with connect() as db:
data = db.sql("""
SELECT
avg(sent.class_id) as sentiment
,s.published_at as date

@ -24,115 +20,116 @@ def over_time():
GROUP BY
s.published_at
""").df()
DB.close()

ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
ax.set(title="sentiment vs. time")
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
filename = "bias_vs_sentiment_over_time.png"
"""plot sentiment/bias vs. time"""

DB = connect()
data = DB.sql("""
filename = "bias_vs_sentiment_over_time.png"
save_to = paths('figures') / filename

with connect() as db:
data = db.sql("""
with cte as (
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
--,b.ordinal as ordinal
,b.bias
FROM top.story_sentiments sent
JOIN top.stories s
,p.bias
FROM story_sentiments sent
JOIN stories s
ON s.id = sent.story_id
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
date_trunc('yearweek', s.published_at)
,b.bias
,p.bias
)
SELECT
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,date
,bias
FROM cte
WHERE year(date) not in (2005, 2023)
""").df()
DB.close()

order = ['left', 'left-center', 'center', 'right-center', 'right']
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
#ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()
print(f"saved: {filename}")
print(f"saved: {save_to}")

@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
filename = "bias_vs_recent_winner.png"
"""plot bias vs. distance to election"""

DB = connect()
data = DB.sql("""
filename = "bias_vs_recent_winner.png"
save_to = paths('figures') / filename

with connect() as db:
data = db.sql("""
SELECT
e.days_away as days_away
,b.ordinal
round(e.days_away, -1) as days_away
,p.bias
,avg(sent.class_id) as sentiment
,count(1) as stories
FROM top.stories s
JOIN top.story_sentiments sent
FROM stories s
JOIN story_sentiments sent
ON s.id = sent.story_id
JOIN election_distance e
ON e.publish_date = s.published_at
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
GROUP BY
e.days_away
,b.ordinal
round(e.days_away, -1)
,p.bias
""").df()
DB.close()
data

ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
plt.tight_layout()
plt.savefig(out_path / filename)
plt.savefig(save_to)
plt.close()

print(f"saved: {filename}")
print(f"saved: {save_to}")

@click.command('plot:sentiment-hist')
def sentiment_hist():

filename = "sentiment_hist.png"
save_to = paths('figures') / filename

DB = connect()

DB.query("""
select
sent.label
,count(distinct s.id) as stories
,count(distinct s.publisher_id) as publishers
from top.story_sentiments sent
join top.stories s
on s.id = sent.story_id
group by
sent.label
""").df().to_markdown(index=False)

data = DB.sql("""
with connect() as db:
data = db.sql("""
SELECT
b.ordinal
p.bias
,count(1) as stories
FROM stories s
JOIN publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
GROUP BY
b.ordinal
p.bias
""").df()
DB.close()

ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
plt.tight_layout()
plt.savefig(out_path / filename)
print(f"saved: {filename}")
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np

DB = connect()
edges = DB.query("""
select
*
from link_edges
""").df()
DB.close()

edges

adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

DB = connect()
DB.query("create schema top")

DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")

DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")

DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")

@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, data_dir
from data.main import connect, paths
import os
from pathlib import Path
import numpy as np

@ -62,7 +62,7 @@ def embed(chunks):
ids = np.concatenate(embedding_ids)

# save embeddings
save_to = data_dir() / 'embeddings.npy'
save_to = paths('data') / 'embeddings.npy'
np.save(save_to, embeddings)
print(f"embeddings saved: {save_to}")

@ -75,15 +75,15 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))

embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

DB = connect()
data = DB.query("""

with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id

@ -97,7 +97,6 @@ def create_avg_pca_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
DB.close()

results = []
for publisher_id, group in data.groupby(['publisher_id']):
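# sketch (assumption, elided by the next hunk): the loop body presumably
# mean-pools each publisher's story embeddings, e.g.
#     avg = embeddings[group['index'].values].mean(axis=0)
#     results.append({'publisher_id': publisher_id, 'embedding': avg})
# before the elided code stacks the averages, fits a 2-component PCA, and keeps
# the projection as `pred`, which feeds the 'first'/'second' columns below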

@ -115,8 +114,8 @@ def create_avg_pca_table():
results['second'] = pred[:, 1]

table_name = "top.publisher_embeddings_pca"
DB = connect()
DB.query(f"""
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
results.publisher_id as publisher_id

@ -124,20 +123,19 @@ def create_avg_pca_table():
,results.second as second
FROM results
""")
DB.close()

print(f"created {table_name}")

@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))

embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')

DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id

@ -150,12 +148,11 @@ def create_pca_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
pub = db.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()

x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)

@ -166,8 +163,8 @@ def create_pca_table():

table_name = f"top.story_embeddings_pca"

DB = connect()
DB.query(f"""
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id

@ -175,21 +172,20 @@ def create_pca_table():
,data.second as second
FROM data
""")
DB.close()

print(f"created {table_name}")

@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))

embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

DB = connect()
data = DB.query("""
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
