add mbfc data. use context manager for db. add paths fn.
(image diffs: 10 figure images regenerated, 7 new figure images added)
new file (+27 lines)

@@ -0,0 +1,27 @@
import click
from efficient_apriori import apriori
from data.main import connect


@click.command("apriori:rules")
def rules():
    DB = connect()
    data = DB.query("""
        SELECT
            --list_prepend(parent.id, list(child.id)) as transaction
            list_prepend(parent.tld, list(child.tld)) as transaction
        FROM stories s
        JOIN related_stories r
            ON r.parent_id = s.id
        JOIN publishers parent
            ON parent.id = s.publisher_id
        JOIN publishers child
            ON child.id = r.publisher_id
        GROUP BY
            --parent.id
            parent.tld
    """).df()
    DB.close()

    transactions = data.transaction.apply(lambda x: tuple(x)).values

    itemsets, rules = apriori(transactions, min_support=0.1, min_confidence=0.8)
    print(*rules, sep="\n")
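For reference, efficient_apriori consumes an iterable of tuples and returns (itemsets, rules), which is what the command above prints; a toy run against hand-made publisher-TLD baskets (the TLD values are illustrative, not from the data):

    from efficient_apriori import apriori

    # each tuple: a parent publisher's tld followed by the tlds it linked to
    transactions = [
        ("apnews.com", "reuters.com", "cnn.com"),
        ("apnews.com", "cnn.com"),
        ("breitbart.com", "foxnews.com"),
    ]
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=0.8)
    for rule in rules:
        print(rule)  # rules render like "{cnn.com} -> {apnews.com}" with conf/supp stats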
src/bias.py (185 lines changed)

@@ -1,67 +1,42 @@
 import click
-from data.main import connect
+from data.main import connect, paths
 import pandas as pd
 from lxml import etree
 from pathlib import Path
 import os
 import csv

-def label_to_int(rating: str) -> int:
-    mapping = {
-        'left'         : 0,
-        'left-center'  : 1,
-        'center'       : 2,
-        'right-center' : 3,
-        'right'        : 4,
-        'allsides'     : -1,
-    }
-    return mapping[rating]
-
-def int_to_label(class_id: int) -> str:
-    mapping = {
-        0  : 'left',
-        1  : 'left-center',
-        2  : 'center',
-        3  : 'right-center',
-        4  : 'right',
-        -1 : 'allsides',
-    }
-    return mapping[class_id]
-
 @click.command(name="bias:normalize")
 def normalize() -> None:
-    DB = connect()
-    DB.sql("""
+    with connect() as db:
+        db.sql("""
            CREATE OR REPLACE TABLE publisher_bias AS
            WITH cte AS (
                SELECT
                    p.id as publisher_id
                    ,b.id as bias_id
                    ,b.bias as label
                    ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
                FROM bias_ratings b
                JOIN top.publishers p
                    ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
            ),ranked AS (
                SELECT
                    publisher_id
                    ,bias_id
                    ,label
                    ,similarity
                    ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
                FROM cte
            )
            SELECT
                publisher_id
                ,label
                ,bias_id
            FROM ranked
            WHERE ranked.rn = 1
        """)

     mapping = [
         {'label' :'left' , 'ordinal': -2},

@@ -72,22 +47,20 @@ def normalize() -> None:
     ]
     mapping = pd.DataFrame(mapping)

-    DB.query("alter table bias_ratings add column ordinal int")
-    DB.query("""
+    with connect() as db:
+        db.query("alter table bias_ratings add column ordinal int")
+        db.query("""
            update bias_ratings b
            set ordinal = o.ordinal
            FROM mapping o
            WHERE o.label = b.bias
        """)

 @click.command(name='bias:parse')
 def parse() -> None:
     """parse the saved html page of allsides.com bias ratings into a normalized csv file"""
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    bias_html = DATA_DIR / 'allsides.html'
+    bias_html = paths('data') / 'allsides.html'

     parser = etree.HTMLParser()
     tree = etree.parse(str(bias_html), parser)

@@ -111,65 +84,63 @@ def parse() -> None:
         rating['disagree'] = int(disagree)
         ratings.append(rating)
     df = pd.DataFrame(ratings)
-    df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)

 @click.command(name="bias:load")
 def load() -> None:
-    DB = connect()
-    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
-    f = str(DATA_DIR / "bias_ratings.csv")
-
-    DB.sql(f"""
+    f = str(paths('data') / "bias_ratings.csv")
+
+    with connect() as db:
+        db.sql(f"""
            CREATE TABLE bias_ratings as
            select
                row_number() over(order by b.publisher) as id
                ,b.*
            from read_csv_auto('{f}') b
        """)

 @click.command('bias:export')
 def export():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
-    DB = connect()
-    all_bias = DB.query("""
+    with connect() as db:
+        all_bias = db.query("""
            SELECT
                id as bias_id
                ,publisher as name
                ,bias as label
            FROM bias_ratings
            ORDER by agree desc
        """)
-    all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
+        all_bias.df().to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)

-    mapped_bias = DB.query("""
+    with connect() as db:
+        mapped_bias = db.query("""
            SELECT
                p.id as publisher_id
                ,p.name as name
                ,p.tld as tld
                ,b.label as bias
                ,b.bias_id as bias_id
            FROM top.publishers p
            LEFT JOIN publisher_bias b
                ON b.publisher_id = p.id
        """)
-    mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
-    DB.close()
+        mapped_bias.df().to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)

 @click.command('bias:import-mapped')
 def import_mapped():
-    data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
     table_name = "top.publisher_bias"

-    DB = connect()
-    df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
+    df = pd.read_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|")

-    DB.query(f"""
+    with connect() as db:
+        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                publisher_id AS publisher_id
                ,cast(bias_id AS int) as bias_id
            FROM df
            WHERE bias_id IS NOT NULL
        """)

     print(f"created table: {table_name}")
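The 0.95 similarity threshold above is what does the entity resolution between AllSides names and publisher names. As a rough sanity check of what that threshold means, DuckDB exposes the same function at the SQL prompt (a sketch; the example strings are illustrative):

    import duckdb

    # near-identical names score close to 1.0; looser matches fall off quickly
    duckdb.sql("""
        select
            jaro_winkler_similarity('the new york times', 'new york times') as close
            ,jaro_winkler_similarity('the new york times', 'new york post') as far
    """).show()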
src/cli.py (54 lines changed)

@@ -1,5 +1,7 @@
 import click
 from dotenv import load_dotenv
+import data
+import plots

 @click.group()
 def cli():

@@ -7,12 +9,20 @@ def cli():
 if __name__ == "__main__":
     load_dotenv()
-    from data import scrape
-    cli.add_command(scrape.download)
-    cli.add_command(scrape.parse)
-    cli.add_command(scrape.load)
-    cli.add_command(scrape.normalize)
-    cli.add_command(scrape.create_elections_table)
+    # original bias ratings
+    cli.add_command(data.scrape.download)
+    cli.add_command(data.scrape.parse)
+    cli.add_command(data.scrape.load)
+    cli.add_command(data.scrape.normalize)
+    cli.add_command(data.scrape.create_elections_table)
+
+    cli.add_command(data.factcheck.parse_index)
+    cli.add_command(data.factcheck.scrape)
+
+    cli.add_command(data.links.create_table)
+    cli.add_command(data.links.create_pca)
+    cli.add_command(data.links.create_clusters)

     import word
     # cli.add_command(word.distance)

@@ -23,10 +33,12 @@ if __name__ == "__main__":
     cli.add_command(bias.parse)
     cli.add_command(bias.load)
     cli.add_command(bias.normalize)

     import mine
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)

     import emotion
     cli.add_command(emotion.extract)
     cli.add_command(emotion.normalize)

@@ -40,34 +52,20 @@ if __name__ == "__main__":
     from train import main as train_main
     cli.add_command(train_main.main)

-    import plots.descriptive as plotd
-    cli.add_command(plotd.articles_per_year)
-    cli.add_command(plotd.distinct_publishers)
-    cli.add_command(plotd.stories_per_publisher)
-    cli.add_command(plotd.top_publishers)
-    cli.add_command(plotd.common_tld)
-
-    import links as linkcli
-    cli.add_command(linkcli.create_table)
-    cli.add_command(linkcli.create_pca)
-    cli.add_command(linkcli.create_clusters)
-
-    import plots.links as plotl
-    cli.add_command(plotl.elbow)
-    cli.add_command(plotl.link_pca_clusters)
-
-    import plots.classifier as plotc
-    cli.add_command(plotc.pca_with_classes)
-
-    import plots
+    cli.add_command(plots.descriptive.articles_per_year)
+    cli.add_command(plots.descriptive.distinct_publishers)
+    cli.add_command(plots.descriptive.stories_per_publisher)
+    cli.add_command(plots.descriptive.top_publishers)
+    cli.add_command(plots.descriptive.common_tld)
+
     cli.add_command(plots.sentence.sentence_pca)
     cli.add_command(plots.sentence.avg_sentence_pca)
     cli.add_command(plots.emotion.emotion_over_time)
     cli.add_command(plots.emotion.emotion_regression)

     cli.add_command(plots.sentiment.over_time)
     cli.add_command(plots.sentiment.bias_over_time)
     cli.add_command(plots.sentiment.bias_vs_recent_winner)
+    cli.add_command(plots.links.elbow)
+    cli.add_command(plots.links.link_pca_clusters)
+    cli.add_command(plots.classifier.pca_with_classes)

     cli()
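With the group wired this way every command runs through the single entry point; assumed invocations (the command names are the click names registered above, and the script path is assumed from the repo layout):

    python src/cli.py links:create-table
    python src/cli.py plot:bias-hist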
@@ -1,6 +1,10 @@
 import data.main
 import data.scrape
+import data.factcheck
+import data.links
 __all__ = [
     'main'
     ,'scrape'
+    ,'factcheck'
+    ,'links'
 ]
new file (+171 lines): the mediabiasfactcheck.com scraper (registered as data.factcheck in cli.py)

@@ -0,0 +1,171 @@
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd
from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, bias_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm


@click.command('mbfc:parse-index')
def parse_index():
    parser = etree.HTMLParser()
    publishers = []
    for page in range(1, 54):
        url = f"https://mediabiasfactcheck.com/filtered-search/?pg={page}"
        print(f"downloading {url}", file=sys.stderr)
        response = requests.get(url)
        html = response.content
        tree = etree.parse(BytesIO(html), parser)
        rows = tree.xpath('//table[@class="mbfc-table"]/tbody/tr')
        print(f"parsing {len(rows)} rows", file=sys.stderr)
        for row in rows:
            publisher = {}
            link, bias, reporting, country, credibility, media_type, traffic, popularity = tuple(col for col in row.iterchildren())
            link = link.xpath('./a')[0]
            publisher['name'] = link.text
            publisher['detail_url'] = link.get('href')
            publisher['bias'] = bias.text
            publisher['reporting'] = reporting.text
            publisher['country'] = country.text
            publisher['credibility'] = credibility.text
            publisher['media_type'] = media_type.text
            publisher['traffic'] = traffic.text
            publisher['popularity'] = popularity.xpath('./span')[0].text
            publishers.append(publisher)
    df = pd.DataFrame(publishers)
    save_to = paths('data') / 'mbfc_bias.csv'
    df.to_csv(save_to, sep='|', index=False)
    print(f"saved {len(df)}: {save_to}", file=sys.stderr)


@click.command("mbfc:schema")
def schema():
    with connect() as db:
        db.sql("""create schema mbfc""")
        db.sql("""create or replace table mbfc.scrape (
            url text
            ,scraped_at datetime default now()
        )
        """)


@click.command("mbfc:scrape")
def scrape():
    df = pd.read_csv(paths('data') / 'mbfc_bias.csv', sep="|")

    with connect() as db:
        # progress ledger: which detail pages have already been fetched
        stats = db.query("""
            select
                count(1) filter(where s.url is not null) as elapsed
                ,count(1) filter(where s.url is null) as remaining
            from df
            left join mbfc.scrape s
                on df.detail_url = s.url
        """).fetchall()
        df = db.query("""
            select
                detail_url as url
            from df
            where df.detail_url not in (
                select
                    url
                from mbfc.scrape
            )
        """).df()
    print(f"{stats[0][0]} elapsed. {stats[0][1]} remaining.")

    for url in df.url:
        delay = randint(1, 3)
        save_as = paths('data') / 'mbfc' / (url.strip('/').split('/')[-1] + '.html')
        print(f"downloading (delay: {delay}): {url}", file=sys.stderr)
        sleep(delay)
        try:
            response = requests.get(url)
        except Exception:
            print(f"request failed: {url}", file=sys.stderr)
            continue
        with open(save_as, 'w') as f:
            f.write(response.text)
        with connect() as db:
            db.execute("""insert into mbfc.scrape (url) values (?)""", [url])
        print(f"saved: {save_as}", file=sys.stderr)


def load():
    publishers = []
    for i, page in enumerate(tqdm((paths('data') / 'mbfc').iterdir())):
        publisher = {}
        publisher['origin_url'] = f"https://mediabiasfactcheck.com/{page.stem}"
        with page.open() as p:
            tree = BeautifulSoup(p, 'html.parser')
        # find the "Source:" paragraph and take its first link as the publisher homepage
        for e in tree(string=re.compile(r'source:', re.IGNORECASE)):
            e = e.parent
            while e.name != 'p':
                e = e.parent
            l = e.find('a')
            if l:
                publisher['tld'] = l.get('href')
                break
        else:
            breakpoint()  # debugging aid: a saved page without a Source: link
        publishers.append(publisher)
    df = pd.DataFrame(publishers)
    df.to_csv(paths('data') / 'mbfc_publisher_url.csv', index=False, sep="|")


@click.command('mbfc:create-tables')
def create_tables():
    pubs = pd.read_csv(paths('data') / 'mbfc_publishers.csv', sep='|')
    urls = pd.read_csv(paths('data') / 'mbfc_publisher_url.csv', sep="|")
    df = pubs.merge(urls, on='mbfc_url')
    df['tld'] = df.tld.apply(map_tld)
    df['ordinal'] = df.bias.apply(bias_label_to_int)

    with connect() as db:
        # one publisher row per tld; mode() collapses duplicate listings
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publishers AS
            SELECT
                row_number() over() as id
                ,p.tld
                ,mode(p.name) as name
                ,mode(p.bias) as bias
                ,mode(p.ordinal) as ordinal
                ,mode(p.reporting) as reporting
                ,mode(p.country) as country
                ,mode(p.credibility) as credibility
                ,mode(p.media_type) as media_type
                ,mode(p.traffic) as traffic
                ,mode(p.popularity) as popularity
            FROM df p
            GROUP BY
                p.tld
        """)

    with connect() as db:
        raw_stories = db.sql("""
            SELECT
                *
            FROM stories s
        """).df()

    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE mbfc.publisher_stories AS
            SELECT
                s.id as story_id
                ,p.id as publisher_id
            FROM raw_stories s
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)
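Taken together, the commands above form a small pipeline; an assumed invocation order (entry-point path assumed, command names from the click decorators above):

    python src/cli.py mbfc:parse-index    # crawl the 53 filtered-search pages into mbfc_bias.csv
    python src/cli.py mbfc:schema         # create the mbfc schema and the scrape ledger table
    python src/cli.py mbfc:scrape         # fetch each detail page once, recording progress in mbfc.scrape
    python src/cli.py mbfc:create-tables  # normalize into mbfc.publishers / mbfc.publisher_stories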
new file (+135 lines): registered as data.links in cli.py

@@ -0,0 +1,135 @@
import click
from data.main import connect
import pandas as pd


@click.command('links:create-table')
def create_table():
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE link_edges AS
            with cte as(
                SELECT
                    s.publisher_id as parent_id
                    ,r.publisher_id as child_id
                    ,count(1) as links
                FROM stories s
                JOIN related_stories r
                    ON s.id = r.parent_id
                group by
                    s.publisher_id
                    ,r.publisher_id
            )
            SELECT
                cte.parent_id
                ,cte.child_id
                ,cte.links as links
                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
                ,case when cte.links > 0 then 1 else 0 end as onehot
            FROM cte
            WHERE cte.child_id in (
                SELECT
                    distinct parent_id
                FROM cte
            )
            AND cte.parent_id in (
                SELECT
                    distinct child_id
                FROM cte
            )
        """)

        db.query("""
            SELECT
                *
                ,count(1) over()
            FROM link_edges e
            limit 1
        """)

    print("created link_edges")


@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """create 2D pca labels"""
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON p.id = ps.publisher_id
        """).df()
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    svd = PCA(n_components=2)
    svd_out = svd.fit_transform(pivot)
    out = pivot.reset_index()[['parent_id']]
    out['first'] = svd_out[:, 0]
    out['second'] = svd_out[:, 1]
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)

    print(f"created {table_name}")


@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
    """create link adj. matrix clusters table"""
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"
    with connect() as db:
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
        """).df()
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    k = 8
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)
    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)
    print(f"created {table_name}")
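The pivot in create_pca/create_clusters is what turns the edge list into the dense adjacency matrix the sklearn calls consume; a minimal sketch with toy ids:

    import pandas as pd

    edges = pd.DataFrame({
        'parent_id': [1, 1, 2],
        'child_id':  [2, 3, 1],
        'links':     [5, 1, 2],
    })
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    # rows: linking publisher, columns: linked-to publisher; adj.loc[1, 2] == 5.0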
|
@ -2,6 +2,10 @@ import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import duckdb
|
import duckdb
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from tld import get_tld
|
||||||
|
from tld.utils import update_tld_names
|
||||||
|
import sys
|
||||||
|
|
||||||
class Data(str, Enum):
|
class Data(str, Enum):
|
||||||
Titles = 'titles'
|
Titles = 'titles'
|
||||||
|
@ -9,6 +13,16 @@ class Data(str, Enum):
|
||||||
def data_dir():
|
def data_dir():
|
||||||
return Path(os.environ['DATA_MINING_DATA_DIR'])
|
return Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||||
|
|
||||||
|
def paths(name='app'):
|
||||||
|
if 'app' in name:
|
||||||
|
return Path(os.environ['DATA_MINING_APP_DIR'])
|
||||||
|
if 'data' in name:
|
||||||
|
return Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||||
|
if 'doc' in name:
|
||||||
|
return Path(os.environ['DATA_MINING_DOCS_DIR'])
|
||||||
|
if 'figure' in name:
|
||||||
|
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
|
||||||
|
|
||||||
def connect():
|
def connect():
|
||||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||||
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
|
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
|
||||||
|
@ -28,3 +42,66 @@ def from_db(t: Data):
|
||||||
limit 100
|
limit 100
|
||||||
""").df()
|
""").df()
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
def map_tld(x):
|
||||||
|
try:
|
||||||
|
res = get_tld(x, as_object=True)
|
||||||
|
return res.fld
|
||||||
|
except:
|
||||||
|
print(f"'{x}' is not valid.", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def ticklabels():
|
||||||
|
return [
|
||||||
|
'Left',
|
||||||
|
'Left-Center',
|
||||||
|
'Least Biased',
|
||||||
|
'Right-Center',
|
||||||
|
'Right',
|
||||||
|
]
|
||||||
|
|
||||||
|
def bias_label_to_int(rating:str, source: str = 'mbfc') -> int:
|
||||||
|
if source == 'mbfc':
|
||||||
|
mapping = {
|
||||||
|
'Left' : 0,
|
||||||
|
'Left-Center' : 1,
|
||||||
|
'Least Biased' : 2,
|
||||||
|
'Right-Center' : 3,
|
||||||
|
'Right' : 4,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
mapping = {
|
||||||
|
'left' : 0,
|
||||||
|
'left-center' : 1,
|
||||||
|
'center' : 2,
|
||||||
|
'right-center' : 3,
|
||||||
|
'right' : 4,
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
return mapping[rating]
|
||||||
|
except:
|
||||||
|
print(f"no mapping for {rating}", file=sys.stderr)
|
||||||
|
return -1
|
||||||
|
|
||||||
|
def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
|
||||||
|
if source == 'mbfc':
|
||||||
|
mapping = {
|
||||||
|
0 : 'Left',
|
||||||
|
1 : 'Left-Center',
|
||||||
|
2 : 'Least Biased',
|
||||||
|
3 : 'Right-Center',
|
||||||
|
4 : 'Right',
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
mapping = {
|
||||||
|
0 : 'left',
|
||||||
|
1 : 'left-center',
|
||||||
|
2 : 'center',
|
||||||
|
3 : 'right-center',
|
||||||
|
4 : 'right',
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
return mapping[class_id]
|
||||||
|
except:
|
||||||
|
print(f"no mapping for {class_id}", file=sys.stderr)
|
||||||
|
return -1
|
||||||
|
|
|
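connect() is not shown in full here, but the `with connect() as db:` pattern used throughout the PR works because duckdb connections implement the context-manager protocol and close on exit; a minimal sketch, assuming the database file lives under the data dir (the filename is an assumption):

    import os
    import duckdb
    from pathlib import Path

    def connect():
        data_dir = Path(os.environ['DATA_MINING_DATA_DIR'])
        # DuckDBPyConnection defines __enter__/__exit__, so callers can write
        # `with connect() as db:` and the connection closes when the block exits
        return duckdb.connect(str(data_dir / 'data.duckdb'))  # filename assumed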
@@ -319,12 +319,6 @@ def another_norm():
     """)

-def map_tld(x):
-    try:
-        res = get_tld(x, as_object=True)
-        return res.fld
-    except:
-        return None

 DB.sql("""
     SELECT
new file (+47 lines)

@@ -0,0 +1,47 @@
from data.main import connect
import pandas as pd
import numpy as np


def create_tables():
    with connect() as db:
        edges = db.query("""
            select
                *
            from link_edges
        """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

    with connect() as db:
        db.query("create schema top")

        db.query("""
            CREATE OR REPLACE TABLE top.publishers AS
            SELECT
                p.*
            FROM publishers p
            JOIN select_publishers s
                ON s.publisher_id = p.id
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.stories AS
            SELECT
                s.*
            FROM stories s
            JOIN top.publishers p
                ON s.publisher_id = p.id
            WHERE year(s.published_at) >= 2006
            AND year(s.published_at) < 2023
        """)

        db.query("""
            CREATE OR REPLACE TABLE top.related_stories AS
            SELECT
                r.*
            FROM top.stories s
            JOIN related_stories r
                ON s.id = r.parent_id
        """)
@@ -1,10 +1,11 @@
+import click
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
 import torch.nn.functional as F
-from data import connect, data_dir
+from data.main import connect, paths
 import numpy as np
 from tqdm import tqdm
-import click
+import pandas as pd

 @click.option('-c', '--chunks', type=int, default=500, show_default=True)
 @click.command("sentiment:extract")

@@ -67,20 +68,19 @@ def extract(chunks):
 @click.command('sentiment:load')
 def load():
-    DB = connect()
-    sentiments = np.load(data_dir() / 'sentiment.npy')
-    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
+    sentiments = np.load(paths('data') / 'sentiment.npy')
+    story_ids = np.load(paths('data') / 'sentiment_ids.npy')
     data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
     data['sentiment_id'] = sentiments

-    DB.query("""
-        CREATE OR REPLACE TABLE top.story_sentiments AS
-        SELECT
-            data.story_id
-            ,data.sentiment_id as class_id
-            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
-        FROM data
-        JOIN top.stories s
-            ON s.id = data.story_id
-    """)
-    DB.close()
+    with connect() as db:
+        db.query("""
+            CREATE OR REPLACE TABLE story_sentiments AS
+            SELECT
+                data.story_id
+                ,data.sentiment_id as class_id
+                ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
+            FROM data
+            JOIN stories s
+                ON s.id = data.story_id
+        """)
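The load step leans on positional alignment between the two .npy files; a toy illustration of the frame it builds before the CREATE TABLE runs (values illustrative):

    import numpy as np
    import pandas as pd

    story_ids = np.array([101, 102, 103])   # stands in for sentiment_ids.npy
    sentiments = np.array([1, 0, 1])        # stands in for sentiment.npy; 1 = positive

    data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
    data['sentiment_id'] = sentiments
    # reset_index() keeps the original row order as an explicit column,
    # so ids and predictions stay aligned when duckdb scans `data`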
src/links.py (deleted, -255 lines)

@@ -1,255 +0,0 @@
The links:create-table, links:create-pca, and links:create-clusters commands moved to data/links.py (above), rewritten against the mbfc tables with `with connect() as db:` in place of the top.* tables and paired connect()/DB.close() calls. Deleted along with them: an exploratory query left in create_table, and the scratch helpers to_matrix() and links:analysis:

    DB = connect()
    DB.query("""
        SELECT
            *
            ,-log10(links)
            --distinct parent_id
        FROM top.link_edges e
        WHERE e.parent_id = 238
    """)
    DB.close()

def to_matrix():
    """returns an adjacency matrix of publishers to publisher link frequency"""
    DB = connect()

    bias_map = pd.DataFrame([
        {'label' :'left',         'value' : 0},
        {'label' :'left-center',  'value' : 1},
        {'label' :'center',       'value' : 2},
        {'label' :'right-center', 'value' : 3},
        {'label' :'right',        'value' : 4},
        {'label' :'allsides',     'value' : -1},
    ])

    bias = DB.sql("""
        SELECT
            b.id
            ,b.label
            ,m.value
        FROM publisher_bias b
        JOIN bias_map m
            ON b.label = m.label
        WHERE value != -1
    """).df()

    pub = DB.sql("""
        select
            p.id
            ,p.name
            ,p.url
        from publishers p
    """).df()

    edges = DB.sql("""
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
                ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
            ON p.id = cte.parent_id
    """).df()

    # only keep values that have more than 1 link (exploratory lines, left as they were)
    test = edges[edges['links'] > 2].pivot(index='parent_id', columns='child_id', values='links').fillna(0).reset_index()
    edges.dropna().pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    pd.merge(adj, pub, how='left', left_on='parent_id', right_on='id')
    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    adj.values.shape

    out = pd.DataFrame(adj.index.values, columns=['id'])
    out = pd.merge(out, pub, how='left', on='id')
    return out

@click.command('links:analysis')
def analysis():
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.cluster import MiniBatchKMeans
    adj = to_matrix()
    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)

    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)

    x = svd_out[:, 0]
    y = svd_out[:, 1]

    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()

    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)

    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()

    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
    plt.show()
@@ -1,6 +1,5 @@
-from data.main import data_dir, connect
+from data.main import connect, paths
 import numpy as np
-import sklearn
 from sklearn.cluster import MiniBatchKMeans
 import click
 from pathlib import Path

@@ -11,7 +10,7 @@ from enum import Enum, auto
 @click.command(name="mine:embeddings")
 def embeddings():
-    data = np.load(data_dir() / "embeddings.npy")
+    data = np.load(paths('data') / "embeddings.npy")
     kmeans = MiniBatchKMeans(n_clusters=5,
                              random_state=0,
                              batch_size=6,

@@ -76,7 +75,7 @@ class PlotName(str, Enum):
 @click.option('-n', '--name', required=True, type=click.Choice(PlotName))
 @click.option('-o', '--output', required=False, type=click.Path())
 def plot(name: PlotName, output: Path):
-    output = output if output else APP_DIR / f'docs/{name}.png'
+    output = output if output else paths('figures') / f'{name}.png'
     if name == PlotName.TitleLength:
         fig, ax = plt.subplots(1,1)
         data = db.sql("""
new file (+36 lines)

@@ -0,0 +1,36 @@
from data.main import connect, map_tld
import os
from pathlib import Path


def normalize():
    # exploratory stats: relations are built but not yet printed or saved
    with connect() as db:
        db.sql("""
            SELECT
                p.name
                ,count(1) as ctn
                ,sum(ctn) over() as all
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
            JOIN stories s
                ON s.id = ps.story_id
            GROUP BY
                p.name
        """)

    with connect() as db:
        db.sql("""
            SELECT
                bias
                ,count(distinct p.id) as publishers
                ,count(1) as stories
                ,count(1) / count(distinct p.id) as ratio
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
            JOIN stories s
                ON s.id = ps.story_id
            GROUP BY
                p.bias
            ORDER BY count(1)
        """)
@@ -1,9 +1,13 @@
 import plots.sentence
 import plots.emotion
 import plots.sentiment
+import plots.links
+import plots.classifier

 __all__ = [
     'sentence',
     'emotion',
     'sentiment',
+    'links',
+    'classifier',
 ]
@@ -1,5 +1,5 @@
 import click
-from data.main import connect
+from data.main import connect, paths, bias_label_to_int, ticklabels
 import os
 from pathlib import Path
 import seaborn as sns

@@ -7,54 +7,53 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd

-out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:bias-hist')
 def hist():
-    filename = "bias_hist.png"
-
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            b.ordinal
-            ,count(1) as stories
-        FROM stories s
-        JOIN publisher_bias pb
-            ON pb.publisher_id = s.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        GROUP BY
-            b.ordinal
-    """).df()
-    DB.close()
+    save_to = paths('figures') / "bias_hist.png"
+
+    with connect() as db:
+        data = db.sql("""
+            SELECT
+                p.ordinal
+                ,count(1) as stories
+            FROM stories s
+            JOIN mbfc.publisher_stories ps
+                ON s.id = ps.story_id
+            JOIN mbfc.publishers p
+                ON ps.publisher_id = p.id
+            WHERE ordinal != -1
+            GROUP BY
+                p.ordinal
+        """).df()

     ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
-    print(f"saved: {filename}")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")

 @click.command('plot:bias-publisher-hist')
 def publisher_hist():
-    filename = "bias_publisher_hist.png"
-
-    DB = connect()
-    data = DB.sql("""
-        SELECT
-            b.ordinal
-            ,count(1) as publishers
-        FROM publisher_bias pb
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        GROUP BY
-            b.ordinal
-    """).df()
-    DB.close()
+    save_to = paths('figures') / "bias_publisher_hist.png"
+
+    with connect() as db:
+        data = db.sql("""
+            SELECT
+                p.ordinal
+                ,count(distinct p.id) as publishers
+            FROM mbfc.publishers p
+            JOIN mbfc.publisher_stories ps
+                ON ps.publisher_id = p.id
+            WHERE ordinal != -1
+            GROUP BY
+                p.ordinal
+        """).df()

     ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
-    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
-    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
+    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels())
     plt.tight_layout()
-    plt.savefig(out_path / filename)
+    plt.savefig(save_to)
     plt.close()
-    print(f"saved: {filename}")
+    print(f"saved: {save_to}")
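ticklabels() (added to data.main above) returns the five MBFC labels in ordinal order, so these bar plots label their x-axis without a local list; the implicit contract, written out as a check:

    from data.main import ticklabels, bias_int_to_label

    # position i of ticklabels() should match bias_int_to_label(i) for the mbfc source
    assert [bias_int_to_label(i) for i in range(5)] == ticklabels()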
@@ -5,30 +5,32 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from pathlib import Path

-out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'

 @click.command('plot:pca-with-classes')
-def pca_with_classes():
-    filename = "pca_with_classes.png"
-
-    DB = connect()
-    data = DB.query(f"""
-        SELECT
-            p.tld
-            ,b.bias
-            ,c.first
-            ,c.second
-            ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
-        FROM top.publishers p
-        JOIN top.publisher_bias pb
-            ON p.id = pb.publisher_id
-        JOIN bias_ratings b
-            ON b.id = pb.bias_id
-        JOIN top.publisher_pca_normalized c
-            ON c.publisher_id = p.id
-    """).df()
-    DB.close()
-    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
-    ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
-    plt.savefig(out_dir / filename)
-    print(f"saved: {filename}")
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def pca_with_classes(source):
+    save_to = paths('figures') / f"link_{source}_pca_with_classes.png"
+
+    with connect() as db:
+        df = db.query(f"""
+            SELECT
+                p.tld
+                ,p.bias
+                ,c.first
+                ,c.second
+                --,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+            FROM mbfc.publishers p
+            JOIN publisher_pca_{source} c
+                ON c.publisher_id = p.id
+            WHERE p.ordinal != -1
+            ORDER BY p.ordinal
+        """).df()
+
+    ax = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
+    ax.set(xlabel="first pca component",
+           ylabel="second pca component")
+    ax.figure.suptitle("pca components vs. bias labels")
+    plt.savefig(save_to)
+    plt.close()
+    print(f"saved: {save_to}")
+    os.system(f'xdg-open {save_to}')
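One detail worth noting on the seaborn change: relplot returns a FacetGrid rather than an Axes, so figure-level titles go through the underlying figure object; a sketch of the intended call on recent seaborn (df as queried above):

    g = sns.relplot(df, x='first', y='second', hue='bias', col='bias', s=100, palette='rainbow')
    g.set(xlabel="first pca component", ylabel="second pca component")
    g.figure.suptitle("pca components vs. bias labels")  # suptitle is a method, not an attribute
    g.figure.subplots_adjust(top=0.88)                   # leave room for the title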
@ -1,169 +1,190 @@
|
||||||
import click
|
import click
|
||||||
from data.main import connect
|
from data.main import connect, paths
|
||||||
import os
|
import os
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
|
||||||
|
|
||||||
@click.command('plot:articles-per-year')
|
@click.command('plot:articles-per-year')
|
||||||
def articles_per_year():
|
def articles_per_year():
|
||||||
filename = 'articles_per_year.png'
|
save_to = paths('figures') / 'articles_per_year.png'
|
||||||
|
|
||||||
DB = connect()
|
with connect() as db:
|
||||||
data = DB.query("""
|
data = DB.query("""
|
||||||
select
|
select
|
||||||
year(published_at) as year
|
year(published_at) as year
|
||||||
,count(1) as stories
|
,count(1) as stories
|
||||||
from stories
|
from stories
|
||||||
group by
|
group by
|
||||||
year(published_at)
|
year(published_at)
|
||||||
""").df()
|
""").df()
|
||||||
DB.close()
|
|
||||||
|
|
||||||
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
|
ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
|
||||||
ax.tick_params(axis='x', rotation=90)
|
ax.tick_params(axis='x', rotation=90)
|
||||||
ax.set(title="count of articles per year", ylabel="count of stories (#)")
|
ax.set(title="count of articles per year", ylabel="count of stories (#)")
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.savefig(out_dir / filename)
|
plt.savefig(save_to)
|
||||||
|
print(f"saved: {save_to}")
|
||||||
|
|
||||||
@click.command('plot:distinct-publishers')
|
@click.command('plot:distinct-publishers')
|
||||||
def distinct_publishers():
|
def distinct_publishers():
|
||||||
filename = 'distinct_publishers.png'
|
save_to = paths('figures') / 'distinct_publishers.png'
|
||||||
|
|
||||||
DB = connect()
|
with connect() as db:
|
||||||
data = DB.query("""
|
data = DB.query("""
|
||||||
select
|
select
|
||||||
year(published_at) as year
|
year(published_at) as year
|
||||||
,count(distinct publisher_id) as publishers
|
,count(distinct publisher_id) as publishers
|
||||||
from stories
|
from stories
|
||||||
group by
|
group by
|
||||||
year(published_at)
|
year(published_at)
|
||||||
""").df()
|
""").df()
|
||||||
DB.close()
|
|
||||||
|
|
||||||
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
|
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
|
||||||
ax.tick_params(axis='x', rotation=90)
|
ax.tick_params(axis='x', rotation=90)
|
||||||
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
|
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.savefig(out_dir / filename)
|
plt.savefig(save_to)
|
||||||
plt.close()
|
plt.close()
|
||||||
|
print(f"saved: {save_to}")
|
||||||
|
|
||||||
@click.command('plot:stories-per-publisher')
|
@click.command('plot:stories-per-publisher')
|
||||||
def stories_per_publisher():
|
def stories_per_publisher():
|
||||||
filename = 'stories_per_publisher.png'
|
save_to = paths('figures') / 'stories_per_publisher.png'
|
||||||
|
|
||||||
DB = connect()
|
with connect() as db:
|
||||||
data = DB.query("""
|
data = db.query("""
|
||||||
with cte as (
|
with cte as (
|
||||||
select
|
|
||||||
publisher_id
|
|
||||||
,year(published_at) as year
|
|
||||||
,count(1) as stories
|
|
||||||
from stories
|
|
||||||
group by
|
|
||||||
publisher_id
|
|
||||||
,year(published_at)
|
|
||||||
) , agg as (
|
|
||||||
select
|
select
|
||||||
publisher_id
|
ps.publisher_id
|
||||||
,avg(stories) as stories_per_year
|
,year(s.published_at) as year
|
||||||
,case
|
,count(1) as stories
|
||||||
when avg(stories) < 2 then 2
|
from stories s
|
||||||
when avg(stories) < 4 then 4
|
join mbfc.publisher_stories ps
|
||||||
when avg(stories) < 8 then 8
|
on ps.story_id = s.id
|
||||||
when avg(stories) < 16 then 16
|
|
||||||
when avg(stories) < 32 then 32
|
|
||||||
when avg(stories) < 64 then 64
|
|
||||||
when avg(stories) < 128 then 128
|
|
||||||
else 129
|
|
||||||
end as max_avg
|
|
||||||
from cte
|
|
||||||
group by
|
group by
|
||||||
publisher_id
|
ps.publisher_id
|
||||||
)
|
,year(s.published_at)
|
||||||
select
|
) , agg as (
|
||||||
max_avg
|
select
|
||||||
,count(1) as publishers
|
publisher_id
|
||||||
from agg
|
,avg(stories) as stories_per_year
|
||||||
group by
|
,case
|
||||||
max_avg
|
when avg(stories) < 2 then 2
|
||||||
""").df()
|
when avg(stories) < 4 then 4
|
||||||
DB.close()
|
when avg(stories) < 8 then 8
|
||||||
|
when avg(stories) < 16 then 16
|
||||||
|
when avg(stories) < 32 then 32
|
||||||
|
when avg(stories) < 64 then 64
|
||||||
|
when avg(stories) < 128 then 128
|
||||||
|
else 129
|
||||||
|
end as max_avg
|
||||||
|
from cte
|
||||||
|
group by
|
||||||
|
publisher_id
|
||||||
|
)
|
||||||
|
select
|
||||||
|
max_avg
|
||||||
|
,count(1) as publishers
|
||||||
|
from agg
|
||||||
|
group by
|
||||||
|
max_avg
|
||||||
|
""").df()
|
||||||
|
|
||||||
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
|
ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
|
||||||
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
|
ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="avg. stories / year", xticklabels=['2', '4', '8', '16', '32', '64', '128', '>128'])
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.savefig(out_dir / filename)
|
plt.savefig(save_to)
|
||||||
plt.close()
|
plt.close()
|
||||||
|
print(f"saved: {save_to}")
|
||||||
|
|
||||||
|
|
||||||
@click.command('plot:top-publishers')
def top_publishers():
    """plot top publishers over time"""

    save_to = paths('figures') / 'top_publishers.png'

    with connect() as db:
        # note: the result of this query is unused; the CTE query below supersedes it
        db.query("""
            SELECT
                p.tld
                ,p.id
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
            JOIN stories s
                ON s.id = ps.story_id
            GROUP BY
                p.tld
                ,p.id
            order by count(1) desc
            limit 20
        """)

    with connect() as db:
        data = db.query("""
            WITH p as (
                SELECT
                    p.tld
                    ,p.id
                FROM mbfc.publishers p
                JOIN mbfc.publisher_stories ps
                    ON ps.publisher_id = p.id
                JOIN stories s
                    ON s.id = ps.story_id
                GROUP BY
                    p.tld
                    ,p.id
                order by count(1) desc
                limit 20
            )
            SELECT
                p.tld
                ,YEAR(s.published_at) AS year
                ,COUNT(1) AS stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN p
                ON p.id = ps.publisher_id
            GROUP by
                p.tld
                ,YEAR(published_at)
            ORDER BY year, COUNT(DISTINCT s.id) DESC
        """).df()

    pivot = data.pivot(columns='year', index='tld', values='stories')
    ax = sns.heatmap(pivot, cmap="crest")
    ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:common_tld')
def common_tld():
    import dataframe_image as dfi
    save_to = paths('figures') / 'common_tld.png'

    with connect() as db:
        data = db.query("""
            select
                split_part(url, '.', -1) as tld
                ,count(1) as publishers
                ,case when count(1) < 20
                    then string_agg(distinct url, '\t')
                    else NULL
                end as urls
            from publishers
            group by
                split_part(url, '.', -1)
            order by
                count(1) desc
        """).df()

    data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(save_to, table_conversion='matplotlib')

def stats():

@ -246,7 +267,7 @@ def stats():
@click.command('plot:bias-stats')
def bias_stats():
    import dataframe_image as dfi
    save_to = paths('figures') / 'bias_stats.png'

    DB = connect()

@ -300,3 +321,69 @@ def bias_stats():
    """).df()
    DB.close()
    print(df.to_markdown(index=False))

@click.command('plot:bias-over-time')
def bias_over_time():
    """plot bias labels over time"""

    save_to = paths('figures') / 'bias_over_time.png'

    with connect() as db:
        df = db.sql("""
            SELECT
                p.bias
                ,p.id
                ,date_trunc('year', s.published_at) as year
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            where year(s.published_at) not in (2006, 2023)
            and p.ordinal != -1
            GROUP BY
                p.bias
                ,p.id
                ,p.ordinal
                ,date_trunc('year', s.published_at)
            order by
                p.ordinal
                ,date_trunc('year', s.published_at)
        """).df()

    ax = sns.relplot(df, kind='line', x='year', y='stories', col='bias', units='id', estimator=None, palette='rainbow')
    ax.set(ylabel="stories", xlabel="year")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

def bias_missing():

    with connect() as db:
        df = db.sql("""
            SELECT
                date_trunc('year', s.published_at) as year
                ,s.tld
                ,count(1) as stories
            FROM stories s
            LEFT JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            WHERE ps.publisher_id is NULL
            AND year(s.published_at) not in (2006, 2023)
            GROUP BY
                s.tld
                ,date_trunc('year', s.published_at)
            HAVING count(1) > 10
            ORDER BY
                date_trunc('year', s.published_at)
        """).df()

    ax = sns.lineplot(df, x='year', y='stories', units='tld', estimator=None)
    ax.set(ylabel="stories", xlabel="year")
    plt.tight_layout()
    plt.show()
    #plt.savefig(save_to)
    plt.close()
    #print(f"saved: {save_to}")

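In the relplot call above, units='id' together with estimator=None is seaborn's spaghetti-plot idiom: one unaggregated line per publisher inside each bias facet, instead of a mean line with a confidence band. A minimal self-contained illustration (synthetic data, not from the database):

# one line per unit, no aggregation -- same idiom as the bias-over-time plot
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

demo = pd.DataFrame({
    'year':    [2018, 2019, 2020] * 2,
    'stories': [10, 12, 9, 3, 5, 8],
    'id':      [1, 1, 1, 2, 2, 2],              # two hypothetical publishers
    'bias':    ['left'] * 3 + ['right'] * 3,
})
sns.relplot(demo, kind='line', x='year', y='stories', col='bias', units='id', estimator=None)
plt.show()
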
@ -1,77 +1,79 @@
import click
from data.main import connect, paths, ticklabels
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


@click.command('plot:emotion-over-time')
def emotion_over_time():

    filename = "emotion_over_time.png"
    save_to = paths('figures') / filename

    with connect() as db:
        emotions = db.sql("""
            SELECT
                date_trunc('year', s.published_at) AS year
                ,e.label AS emotion
                ,count(1) AS stories
            FROM stories s
            JOIN story_emotions se
                ON s.id = se.story_id
            JOIN emotions e
                ON e.id = se.emotion_id
            GROUP by
                date_trunc('year', s.published_at)
                ,e.label
        """).df()

    ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")
    os.system(f'xdg-open {save_to}')

@click.command('plot:emotion-regression')
def emotion_regression():
    """plot emotion over time as regression"""

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "emotion_regression.png"
    save_to = paths('figures') / filename

    with connect() as db:
        #emotions = db.query("""
        #    SELECT
        #        label
        #    FROM emotions e
        #""").df()['label'].to_list()
        df = db.sql(f"""
            SELECT
                epoch(date_trunc('yearweek', s.published_at)) AS date
                ,e.id AS emotion_id
                ,p.id as publisher_id
                ,count(1) AS stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            JOIN story_emotions se
                ON s.id = se.story_id
            JOIN emotions e
                ON e.id = se.emotion_id
            WHERE p.ordinal != -1
            GROUP by
                epoch(date_trunc('yearweek', s.published_at))
                ,p.id
                ,e.id
        """).df()

    results = []
    for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):

@ -83,77 +85,59 @@ def emotion_regression():
        results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
    results = pd.DataFrame(results)

    with connect() as db:
        out = db.query("""
            SELECT
                e.label as emotion
                ,avg(results.per_year) as avg_reg_coef
                ,p.bias
            FROM results
            JOIN emotions e
                ON e.id = results.emotion_id
            JOIN mbfc.publishers p
                ON p.id = results.publisher_id
            GROUP BY
                e.label
                ,p.bias
        """).df()

    pivot = out.pivot(index=['emotion'], columns=['bias'], values=['avg_reg_coef'])

    ax = sns.heatmap(pivot, cmap='BrBG', vmin=-0.01, vmax=0.01, center=0)
    #ax = sns.heatmap(pivot, cmap='RdBu_r', center=0)
    ax.set(title="slope of regression (stories/year) by bias and emotion"
           ,xticklabels=ticklabels()
           ,xlabel="bias"
           ,ylabel="emotion")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

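The body of the groupby loop above is elided by the @ -83 hunk; given that `date` is epoch seconds per yearweek and the heatmap is labeled stories/year, it presumably fits one linear regression per (emotion, publisher) pair and rescales the slope. A hypothetical sketch, not the committed code:

# hypothetical loop body -- slope of stories vs. time, rescaled from per-second to per-year
x = group['date'].to_numpy().reshape(-1, 1)     # epoch seconds of each yearweek bucket
y = group['stories'].to_numpy()
fit = linear_model.LinearRegression().fit(x, y)
per_year = fit.coef_[0] * 60 * 60 * 24 * 365    # stories per second -> stories per year
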
@click.command('plot:emotion-hist')
def emotion_hist():

    filename = "emotion_hist.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            SELECT
                p.bias
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
        """).df()

    ax = sns.barplot(data, x='bias', y='stories', palette='rainbow', order=ticklabels())
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@ -9,20 +9,20 @@ import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd


@click.command('plot:link-elbow')
def elbow():
    from sklearn.cluster import KMeans

    save_to = paths('figures') / 'link_cluster_elbow.png'

    with connect() as db:
        df = db.query("""
            SELECT
                *
            FROM link_edges
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    to_plot = []

@ -36,8 +36,9 @@ def elbow():

    ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
    ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")

    # randomly pick 8

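The sweep that fills to_plot is elided by the @ -36 hunk; a plausible sketch of the standard elbow loop over the link matrix — the exact k range is a guess:

# hypothetical elbow loop -- KMeans inertia for each candidate cluster count
for k in range(2, 17):
    km = KMeans(n_clusters=k, n_init=10).fit(pivot.values)
    to_plot.append({'k': k, 'inertia': km.inertia_})
to_plot = pd.DataFrame(to_plot)
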
@ -45,72 +46,65 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):

    save_to = paths('figures') / f"link_pca_clusters_{source}.png"

    with connect() as db:
        df = db.query(f"""
            SELECT
                c.label as cluster
                ,p.tld
                --,b.label as bias
                ,pca.first
                ,pca.second
                ,s.cnt as stories
            FROM top.publisher_clusters_{source} c
            JOIN top.publishers p
                ON c.publisher_id = p.id
            JOIN
            (
                select
                    s.publisher_id
                    ,count(1) as cnt
                FROM top.stories s
                GROUP BY
                    s.publisher_id
            ) s
                ON s.publisher_id = p.id
            JOIN top.publisher_pca_{source} pca
                ON pca.publisher_id = p.id
        """).df()

    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
    ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
    plt.savefig(save_to)
    print(f"saved plot: {save_to}")

def test():

    with connect() as db:
        db.query("""
            SELECT
                p.id as publisher_id
                ,p.name
                ,p.tld
                ,cast(b.bias_id as int) as bias_id
                ,count(1) as stories
            FROM publishers p
            JOIN stories s
                ON s.publisher_id = p.id
            JOIN publisher_clusters c
                ON c.publisher_id = p.id
            LEFT JOIN publisher_bias b
                ON b.publisher_id = p.id
            where bias_id is null
            group by
                p.id
                ,p.name
                ,p.tld
                ,b.bias_id
            ORDER BY count(1) desc
        """)

@click.command('plot:link-confusion')
@ -120,34 +114,36 @@ def link_confusion():
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"
    save_to = paths('figures') / filename

    with connect() as db:
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    x = pivot.values

@ -166,9 +162,9 @@ def link_confusion():
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")

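The train/test and fit step for this classifier falls in the elided @ -166 hunk above. A hypothetical sketch of what it plausibly contains, given the variables in scope (x from the link matrix, bias with one ordinal per publisher) and the confusion matrix drawn afterwards — the n_neighbors value is a guess:

# hypothetical sketch of the elided training step -- not the committed code
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

y = bias.sort_values('publisher_id').ordinal          # one bias ordinal per publisher row
x_train, x_test, y_train, y_test = train_test_split(x, y)
model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = model.predict(x_test)
fig, ax = plt.subplots()                              # the `ax` handed to ConfusionMatrixDisplay
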
@click.command('plot:link-classifier')
def link_confusion():  # note: reuses the function name from plot:link-confusion above; only the click command name differs
@ -176,49 +172,51 @@ def link_confusion():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "link_confusion.png"

    with connect() as db:
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    x = pivot.values
    y = bias.sort_values('publisher_id').ordinal

    with connect() as db:
        data = db.query(f"""
            SELECT
                p.id as publisher_id
                ,pca.first
                ,pca.second
            FROM top.publisher_pca_onehot pca
            JOIN top.publishers p
                ON pca.publisher_id = p.id
        """).df()

@ -235,11 +233,11 @@ def link_confusion():
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")

    # ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
    # plt.savefig(out_dir / filename)
    # plt.close()
    # print(f"saved plot: {filename}")

@ -1,5 +1,5 @@
import click
from data.main import connect, paths
import os
from pathlib import Path
import seaborn as sns
@ -7,57 +7,52 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

@click.command('plot:sentence-pca')
def sentence_pca():
    save_to = paths('figures') / "embedding_sentence_pca.png"

    with connect() as db:
        data = db.query("""
            SELECT
                pca.first
                ,pca.second
                ,b.bias as label
            FROM top.story_embeddings_pca pca
            JOIN top.stories s
                ON s.id = pca.story_id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(save_to)

@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
    save_to = paths('figures') / "avg_embedding_sentence_pca.png"

    with connect() as db:
        data = db.query("""
            SELECT
                pca.first
                ,pca.second
                ,p.tld
                ,b.bias as label
            FROM top.publisher_embeddings_pca pca
            JOIN top.publishers p
                ON p.id = pca.publisher_id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(save_to)

@click.command('plot:sentence-confusion')
def sentence_confusion():
@ -65,32 +60,31 @@ def sentence_confusion():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "sentence_confusion.png"

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()
        pub = db.query("""
            SELECT
                *
            FROM top.publishers
        """).df()

    train, test = train_test_split(data)
    train_x, train_y = embeddings[train['index']], train['ordinal']

@ -105,7 +99,7 @@ def sentence_confusion():
    ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(save_to)
    plt.close()

    print(f"saved plot: {save_to}")

@ -1,138 +1,135 @@
import click
from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt


@click.command('plot:sentiment-over-time')
def over_time():

    filename = "sentiment_over_time.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            SELECT
                avg(sent.class_id) as sentiment
                ,s.published_at as date
            FROM top.story_sentiments sent
            JOIN top.stories s
                ON s.id = sent.story_id
            GROUP BY
                s.published_at
        """).df()

    ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
    ax.set(title="sentiment vs. time")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
    """plot sentiment/bias vs. time"""

    filename = "bias_vs_sentiment_over_time.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            with cte as (
                SELECT
                    avg(sent.class_id) as sentiment
                    ,date_trunc('yearweek', s.published_at) as date
                    ,p.bias
                FROM story_sentiments sent
                JOIN stories s
                    ON s.id = sent.story_id
                JOIN mbfc.publisher_stories ps
                    ON ps.story_id = s.id
                JOIN mbfc.publishers p
                    ON p.id = ps.publisher_id
                WHERE p.ordinal != -1
                GROUP BY
                    date_trunc('yearweek', s.published_at)
                    ,p.bias
            )
            SELECT
                median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
                ,date
                ,bias
            FROM cte
            WHERE year(date) not in (2005, 2023)
        """).df()

    #ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
    ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
    plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
    ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling median sentiment', xlabel='date')
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

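A note on the window above: because the frame is ordered by date DESC, "ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING" collects each week plus the seven weeks before it in time, i.e. a trailing 8-week median per bias group. An equivalent pandas cross-check (hypothetical; assumes the cte rows are in a frame df):

# trailing 8-week rolling median per bias group -- pandas equivalent of the SQL window
df = df.sort_values('date')
df['smoothed'] = (
    df.groupby('bias')['sentiment']
      .transform(lambda s: s.rolling(window=8, min_periods=1).median())
)
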
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
    """plot bias vs. distance to election"""

    filename = "bias_vs_recent_winner.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            SELECT
                round(e.days_away, -1) as days_away
                ,p.bias
                ,avg(sent.class_id) as sentiment
                ,count(1) as stories
            FROM stories s
            JOIN story_sentiments sent
                ON s.id = sent.story_id
            JOIN election_distance e
                ON e.publish_date = s.published_at
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            GROUP BY
                round(e.days_away, -1)
                ,p.bias
        """).df()

    ax = sns.scatterplot(data, x='days_away', y='sentiment', hue='bias', hue_order=ticklabels(), palette='rainbow')
    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:sentiment-hist')
def sentiment_hist():

    filename = "sentiment_hist.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            SELECT
                p.bias
                ,count(1) as stories
            FROM stories s
            JOIN mbfc.publisher_stories ps
                ON ps.story_id = s.id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
        """).df()

    ax = sns.barplot(data, x='bias', y='stories', hue='bias', palette='rainbow')
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels())
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@ -1,48 +0,0 @@
from data.main import connect
import pandas as pd
import numpy as np

DB = connect()
edges = DB.query("""
    select
        *
    from link_edges
""").df()
DB.close()

edges

adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])

DB = connect()
DB.query("create schema top")

DB.query("""
    CREATE OR REPLACE TABLE top.publishers AS
    SELECT
        p.*
    FROM publishers p
    JOIN select_publishers s
        ON s.publisher_id = p.id
""")

DB.query("""
    CREATE OR REPLACE TABLE top.stories AS
    SELECT
        s.*
    FROM stories s
    JOIN top.publishers p
        ON s.publisher_id = p.id
    WHERE year(s.published_at) >= 2006
    AND year(s.published_at) < 2023
""")

DB.query("""
    CREATE OR REPLACE TABLE top.related_stories AS
    SELECT
        r.*
    FROM top.stories s
    JOIN related_stories r
        ON s.id = r.parent_id
""")
158
src/sentence.py
@ -1,7 +1,7 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths
import os
from pathlib import Path
import numpy as np
@ -62,7 +62,7 @@ def embed(chunks):
    ids = np.concatenate(embedding_ids)

    # save embeddings
    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")

@ -75,29 +75,28 @@ def embed(chunks):
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    results = []
    for publisher_id, group in data.groupby(['publisher_id']):

@ -115,47 +114,45 @@ def create_avg_pca_table():
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                results.publisher_id as publisher_id
                ,results.first as first
                ,results.second as second
            FROM results
        """)
    print(f"created {table_name}")

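The middle of create_avg_pca_table is elided by the @ -115 hunk; from the variables that survive (results, pred, and the two-column assignment), it presumably mean-pools each publisher's story embeddings and projects the averages with a 2-component PCA. A hypothetical sketch, not the committed code:

# hypothetical reconstruction: mean-pool embeddings per publisher, then 2-component PCA
for publisher_id, group in data.groupby(['publisher_id']):
    avg = embeddings[group['index']].mean(axis=0)   # average all story embeddings for one publisher
    results.append({'publisher_id': publisher_id, 'embedding': avg})
results = pd.DataFrame(results)

pred = PCA(n_components=2).fit_transform(np.stack(results['embedding'].to_numpy()))
results['first'] = pred[:, 0]
results['second'] = pred[:, 1]
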
@click.command('sentence:create-pca-table')
def create_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()
        pub = db.query("""
            SELECT
                *
            FROM top.publishers
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal'].to_numpy().reshape(-1, 1)

@ -166,42 +163,41 @@ def create_pca_table():

    table_name = f"top.story_embeddings_pca"

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                data.id as story_id
                ,data.first as first
                ,data.second as second
            FROM data
        """)
    print(f"created {table_name}")

@click.command('sentence:create-svm-table')
def create_svm_table():
    from sklearn import svm
    from sklearn.linear_model import SGDClassifier

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    x = embeddings[data['index']]
    #y = data['ordinal'].to_numpy().reshape(-1, 1)
