finish paper
This commit is contained in:
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
import click
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
|
||||
from random import randint
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
@@ -128,6 +128,8 @@ def create_tables():
|
||||
df['tld'] = df.tld.apply(map_tld)
|
||||
df['ordinal'] = df.bias.apply(bias_label_to_int)
|
||||
|
||||
df.tld
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
CREATE OR REPLACE TABLE mbfc.publishers AS
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import click
|
||||
from data.main import connect
|
||||
from data.main import connect, paths, ticklabels
|
||||
import pandas as pd
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
@click.command('links:create-table')
|
||||
def create_table():
|
||||
@@ -53,7 +55,6 @@ def create_table():
|
||||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_pca(source):
|
||||
"""create 2D pca labels"""
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
table_name = f"publisher_pca_{source}"
|
||||
|
||||
@@ -62,8 +63,6 @@ def create_pca(source):
|
||||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON p.id = ps.publisher_id
|
||||
""").df()
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
@@ -98,9 +97,10 @@ def create_pca(source):
|
||||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_clusters(source):
|
||||
"""create link adj. matrix clusters table"""
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
source = 'links'
|
||||
table_name = f"publisher_clusters_{source}"
|
||||
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
@@ -113,17 +113,18 @@ def create_clusters(source):
|
||||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON ps.publisher_id = p.id
|
||||
""").df()
|
||||
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
k = 8
|
||||
k = 5
|
||||
kmeans = KMeans(n_clusters=k, n_init="auto")
|
||||
pred = kmeans.fit_predict(pivot)
|
||||
out = pivot.reset_index()[['parent_id']]
|
||||
out['label'] = pred
|
||||
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
|
||||
new_table = out[['id', 'label']]
|
||||
|
||||
with connect() as db:
|
||||
db.query(f"""
|
||||
CREATE OR REPLACE TABLE {table_name} AS
|
||||
@@ -132,4 +133,5 @@ def create_clusters(source):
|
||||
,n.label as label
|
||||
FROM new_table n
|
||||
""")
|
||||
|
||||
print(f"created {table_name}")
|
||||
|
||||
@@ -11,7 +11,7 @@ def articles_per_year():
|
||||
save_to = paths('figures') / 'articles_per_year.png'
|
||||
|
||||
with connect() as db:
|
||||
data = DB.query("""
|
||||
data = db.query("""
|
||||
select
|
||||
year(published_at) as year
|
||||
,count(1) as stories
|
||||
@@ -27,6 +27,40 @@ def articles_per_year():
|
||||
plt.savefig(save_to)
|
||||
print(f"saved: {save_to}")
|
||||
|
||||
@click.command('descriptive:articles-per-bias-per-year')
def articles_per_bias_per_year():
    """Plot the number of stories per year for each publisher bias label.

    Counts stories grouped by publication year and publisher bias, keeping
    only bias labels that start with 'L' or 'R' (case-insensitive) and
    excluding stories published in 2005 or 2023. The result is drawn as one
    line per bias label and saved as ``articles_per_bias_per_year.png``
    under the directory returned by ``paths('figures')``.
    """
    save_to = paths('figures') / 'articles_per_bias_per_year.png'

    with connect() as db:
        # The two bias patterns are parenthesized on purpose: AND binds
        # tighter than OR, so without the parentheses every 'R%' row would
        # bypass the year exclusion.
        # NOTE(review): ordering by mode(p.ordinal) presumably fixes the
        # order in which bias labels appear in the legend — confirm.
        data = db.query("""
            select
                date_trunc('year', s.published_at) as year
                ,p.bias
                ,count(1) as stories
            from stories s
            join mbfc.publisher_stories ps
                on ps.story_id = s.id
            join mbfc.publishers p
                on p.id = ps.publisher_id
            where year(s.published_at) not in (2005, 2023)
                and (
                    p.bias ilike 'L%'
                    or p.bias ilike 'R%'
                )
            group by
                date_trunc('year', s.published_at)
                ,p.bias
            order by mode(p.ordinal)
        """).df()

    fig, ax = plt.subplots(figsize=(5, 5))
    sns.lineplot(x=data.year, y=data.stories, hue=data.bias, ax=ax, palette='rainbow')
    ax.tick_params(axis='x', rotation=90)
    ax.set(ylabel="count of stories (#)")
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(save_to)
    print(f"saved: {save_to}")
|
||||
|
||||
@click.command('descriptive:distinct-publishers')
|
||||
def distinct_publishers():
|
||||
save_to = paths('figures') / 'distinct_publishers.png'
|
||||
|
||||
@@ -3,13 +3,14 @@ from data.main import connect, ticklabels, paths
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from sklearn.metrics import silhouette_score
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay, silhouette_score
|
||||
|
||||
@click.command('links:elbow')
|
||||
def elbow():
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
save_to = paths('figures') / 'link_cluster_elbow.png'
|
||||
|
||||
@@ -32,7 +33,7 @@ def elbow():
|
||||
to_plot = pd.DataFrame(to_plot)
|
||||
|
||||
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
|
||||
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
|
||||
ax.set(xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
|
||||
plt.savefig(save_to)
|
||||
plt.close()
|
||||
print(f"saved plot: {save_to}")
|
||||
@@ -43,40 +44,31 @@ def elbow():
|
||||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def link_pca_clusters(source):
|
||||
|
||||
source = 'onehot'
|
||||
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
|
||||
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
c.label as cluster
|
||||
,p.tld
|
||||
--,b.label as bias
|
||||
,pca.first
|
||||
pca.first
|
||||
,pca.second
|
||||
,s.cnt as stories
|
||||
FROM publisher_clusters_{source} c
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON ps.publisher_id = c.publisher_id
|
||||
JOIN mbfc.publishers p
|
||||
ON ps.publisher_id = p.id
|
||||
JOIN
|
||||
(
|
||||
select
|
||||
p.id as publisher_id
|
||||
,count(1) as cnt
|
||||
FROM mbfc.publishers p
|
||||
GROUP BY
|
||||
p.id
|
||||
) s
|
||||
ON s.publisher_id = p.id
|
||||
,pca.publisher_id
|
||||
,p.ordinal as bias
|
||||
,p.name
|
||||
,clusters.label as cluster
|
||||
,count(1) over() as cnt
|
||||
FROM mbfc.publishers p
|
||||
JOIN publisher_pca_{source} pca
|
||||
ON pca.publisher_id = p.id
|
||||
ON p.id = pca.publisher_id
|
||||
JOIN publisher_clusters_{source} clusters
|
||||
ON p.id = clusters.publisher_id
|
||||
""").df()
|
||||
|
||||
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
|
||||
ax = sns.scatterplot(df, x='first', y='second', hue='cluster')
|
||||
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
|
||||
plt.savefig(save_to)
|
||||
print(f"saved plot: {save_to}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def test():
|
||||
@@ -108,9 +100,6 @@ def test():
|
||||
|
||||
@click.command('links:confusion')
|
||||
def link_confusion():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
filename = "link_confusion.png"
|
||||
save_to = paths('figures') / filename
|
||||
@@ -119,28 +108,13 @@ def link_confusion():
|
||||
bias = db.query("""
|
||||
SELECT
|
||||
p.id as publisher_id
|
||||
,b.ordinal
|
||||
FROM top.publishers p
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = p.id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
,p.ordinal
|
||||
FROM mbfc.publishers p
|
||||
""").df()
|
||||
|
||||
df = db.query("""
|
||||
SELECT
|
||||
*
|
||||
FROM top.link_edges
|
||||
WHERE parent_id in (
|
||||
select
|
||||
publisher_id
|
||||
from bias
|
||||
)
|
||||
AND child_id in (
|
||||
select
|
||||
publisher_id
|
||||
from bias
|
||||
)
|
||||
FROM link_edges
|
||||
""").df()
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
@@ -167,9 +141,6 @@ def link_confusion():
|
||||
|
||||
@click.command('links:classifier')
|
||||
def link_confusion():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
save_to = paths('figures') / "link_confusion.png"
|
||||
|
||||
@@ -204,15 +175,20 @@ def link_confusion():
|
||||
x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
|
||||
y = publisher_matrix['ordinal']
|
||||
|
||||
x_train, x_test = train_test_split(x)
|
||||
y_train, y_test = train_test_split(y)
|
||||
|
||||
model = KNeighborsClassifier(n_neighbors=5)
|
||||
model.fit(x, y)
|
||||
y_pred = model.predict(x)
|
||||
model.fit(x_train, y_train)
|
||||
y_pred = model.predict(x_test)
|
||||
|
||||
publisher_matrix['pred'] = y_pred
|
||||
publisher_matrix
|
||||
|
||||
|
||||
fig, ax = plt.subplots(figsize=(5, 5))
|
||||
ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
|
||||
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
|
||||
ax.legend().remove()
|
||||
ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
|
||||
plt.xticks(rotation=45)
|
||||
plt.tight_layout()
|
||||
|
||||
Reference in New Issue
Block a user