finish paper

This commit is contained in:
matt
2023-12-15 09:34:18 -08:00
parent a21ed7a7d9
commit f069a37ca6
19 changed files with 547 additions and 315 deletions

View File

@@ -8,7 +8,7 @@ from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, reporting_label_to_int
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm
@@ -128,6 +128,8 @@ def create_tables():
df['tld'] = df.tld.apply(map_tld)
df['ordinal'] = df.bias.apply(bias_label_to_int)
df.tld
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publishers AS

View File

@@ -1,6 +1,8 @@
import click
from data.main import connect
from data.main import connect, paths, ticklabels
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
@click.command('links:create-table')
def create_table():
@@ -53,7 +55,6 @@ def create_table():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
@@ -62,8 +63,6 @@ def create_pca(source):
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
@@ -98,9 +97,10 @@ def create_pca(source):
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
source = 'links'
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
@@ -113,17 +113,18 @@ def create_clusters(source):
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
k = 5
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
@@ -132,4 +133,5 @@ def create_clusters(source):
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")

View File

@@ -11,7 +11,7 @@ def articles_per_year():
save_to = paths('figures') / 'articles_per_year.png'
with connect() as db:
data = DB.query("""
data = db.query("""
select
year(published_at) as year
,count(1) as stories
@@ -27,6 +27,40 @@ def articles_per_year():
plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('descriptive:articles-per-bias-per-year')
def articles_per_bias_per_year():
    """Plot the number of stories per year, one line per publisher bias label.

    Queries story counts grouped by calendar year and publisher bias,
    restricted to bias labels starting with 'L' or 'R' (case-insensitive),
    and saves a line plot to the figures directory as
    'articles_per_bias_per_year.png'.
    """
    save_to = paths('figures') / 'articles_per_bias_per_year.png'

    with connect() as db:
        # The bias filter is parenthesized on purpose: AND binds tighter than
        # OR, so without the parentheses the year exclusion would only apply
        # to the 'L%' rows and every 'R%' row from 2005/2023 would leak in.
        # NOTE(review): 2005 and 2023 are presumably excluded as partial
        # years -- confirm against the dataset.
        data = db.query("""
            select
                date_trunc('year', s.published_at) as year
                ,p.bias
                ,count(1) as stories
            from stories s
            join mbfc.publisher_stories ps
                on ps.story_id = s.id
            join mbfc.publishers p
                on p.id = ps.publisher_id
            where year(s.published_at) not in (2005, 2023)
                and (
                    p.bias ilike 'L%'
                    or p.bias ilike 'R%'
                )
            group by
                date_trunc('year', s.published_at)
                ,p.bias
            order by mode(p.ordinal)
        """).df()

    fig, ax = plt.subplots(figsize=(5, 5))
    # Rows arrive ordered by the bias ordinal, which determines the order in
    # which seaborn encounters (and colors) the hue levels.
    sns.lineplot(x=data.year, y=data.stories, hue=data.bias, ax=ax, palette='rainbow')
    ax.tick_params(axis='x', rotation=90)
    ax.set(ylabel="count of stories (#)")
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(save_to)
    print(f"saved: {save_to}")
@click.command('descriptive:distinct-publishers')
def distinct_publishers():
save_to = paths('figures') / 'distinct_publishers.png'

View File

@@ -3,13 +3,14 @@ from data.main import connect, ticklabels, paths
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay, silhouette_score
@click.command('links:elbow')
def elbow():
from sklearn.cluster import KMeans
save_to = paths('figures') / 'link_cluster_elbow.png'
@@ -32,7 +33,7 @@ def elbow():
to_plot = pd.DataFrame(to_plot)
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
ax.set(xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
plt.savefig(save_to)
plt.close()
print(f"saved plot: {save_to}")
@@ -43,40 +44,31 @@ def elbow():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
source = 'onehot'
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
with connect() as db:
df = db.query(f"""
SELECT
c.label as cluster
,p.tld
--,b.label as bias
,pca.first
pca.first
,pca.second
,s.cnt as stories
FROM publisher_clusters_{source} c
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = c.publisher_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
JOIN
(
select
p.id as publisher_id
,count(1) as cnt
FROM mbfc.publishers p
GROUP BY
p.id
) s
ON s.publisher_id = p.id
,pca.publisher_id
,p.ordinal as bias
,p.name
,clusters.label as cluster
,count(1) over() as cnt
FROM mbfc.publishers p
JOIN publisher_pca_{source} pca
ON pca.publisher_id = p.id
ON p.id = pca.publisher_id
JOIN publisher_clusters_{source} clusters
ON p.id = clusters.publisher_id
""").df()
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
ax = sns.scatterplot(df, x='first', y='second', hue='cluster')
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
plt.savefig(save_to)
print(f"saved plot: {save_to}")
plt.close()
def test():
@@ -108,9 +100,6 @@ def test():
@click.command('links:confusion')
def link_confusion():
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
filename = "link_confusion.png"
save_to = paths('figures') / filename
@@ -119,28 +108,13 @@ def link_confusion():
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
,p.ordinal
FROM mbfc.publishers p
""").df()
df = db.query("""
SELECT
*
FROM top.link_edges
WHERE parent_id in (
select
publisher_id
from bias
)
AND child_id in (
select
publisher_id
from bias
)
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
@@ -167,9 +141,6 @@ def link_confusion():
@click.command('links:classifier')
def link_confusion():
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay
save_to = paths('figures') / "link_confusion.png"
@@ -204,15 +175,20 @@ def link_confusion():
x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
y = publisher_matrix['ordinal']
x_train, x_test = train_test_split(x)
y_train, y_test = train_test_split(y)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x, y)
y_pred = model.predict(x)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
publisher_matrix['pred'] = y_pred
publisher_matrix
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ax.legend().remove()
ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
plt.xticks(rotation=45)
plt.tight_layout()