227 lines
6.7 KiB
Python
227 lines
6.7 KiB
Python
import click
|
|
from data.main import connect, ticklabels, paths
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from sklearn.metrics import silhouette_score
|
|
import pandas as pd
|
|
|
|
|
|
@click.command('links:elbow')
def elbow():
    """Save an elbow plot of k-means inertia for the publisher link matrix.

    Loads every row of ``link_edges``, pivots it into a
    ``parent_id`` x ``child_id`` matrix of ``links`` (missing pairs become 0),
    fits k-means for k = 2..14 and plots inertia against k. The figure is
    written to ``link_cluster_elbow.png`` under ``paths('figures')``.

    A silhouette coefficient is computed for each k and stored alongside the
    inertia, but only the inertia is plotted.
    """
    from sklearn.cluster import KMeans

    figure_path = paths('figures') / 'link_cluster_elbow.png'

    with connect() as db:
        edges = db.query("""
            SELECT
                *
            FROM link_edges
        """).df()

    # One row per linking publisher, one column per linked publisher.
    link_matrix = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    records = []
    for n_clusters in range(2, 15):
        model = KMeans(n_clusters=n_clusters, n_init="auto")
        model.fit(link_matrix)
        silhouette = silhouette_score(link_matrix, model.labels_, metric='euclidean')
        records.append({
            'k': n_clusters,
            'inertia': model.inertia_,
            'coeff': silhouette,
        })
    scores = pd.DataFrame(records)

    ax = sns.lineplot(x=scores.k, y=scores.inertia)
    ax.set(
        title="elbow criterion plot of clusters",
        xlabel="bin size (k)",
        ylabel="sum of squared distances between centroids/points",
    )
    plt.savefig(figure_path)
    plt.close()
    print(f"saved plot: {figure_path}")
|
|
|
|
# randomly pick 8
|
|
|
|
@click.command('links:pca-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
    """Scatter the first two PCA components of publishers, coloured by cluster.

    Args:
        source: which precomputed tables to read; it is interpolated into the
            table names ``publisher_clusters_{source}`` and
            ``publisher_pca_{source}``. The click option restricts it to
            'links', 'normalized' or 'onehot'.

    The figure is written to ``link_pca_clusters_{source}.png`` under
    ``paths('figures')``.
    """
    save_to = paths('figures') / f"link_pca_clusters_{source}.png"

    # `source` is spliced into the SQL text; when invoked through click it can
    # only be one of the three choices declared on the option above.
    # NOTE(review): subquery `s` counts rows of mbfc.publishers per publisher
    # id, so `stories` looks like it is always 1 — confirm whether it was
    # meant to count stories instead. The column is not used by the plot.
    with connect() as db:
        df = db.query(f"""
            SELECT
                c.label as cluster
                ,p.tld
                --,b.label as bias
                ,pca.first
                ,pca.second
                ,s.cnt as stories
            FROM publisher_clusters_{source} c
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = c.publisher_id
            JOIN mbfc.publishers p
                ON ps.publisher_id = p.id
            JOIN
            (
                select
                    p.id as publisher_id
                    ,count(1) as cnt
                FROM mbfc.publishers p
                GROUP BY
                    p.id
            ) s
                ON s.publisher_id = p.id
            JOIN publisher_pca_{source} pca
                ON pca.publisher_id = p.id
        """).df()

    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
    ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
    plt.savefig(save_to)
    # Close the figure, as the other plotting commands in this file do, so a
    # later plot in the same process does not draw on top of this one.
    plt.close()
    print(f"saved plot: {save_to}")
|
|
|
|
|
|
def test():
    """Run an ad-hoc query for clustered publishers that have no bias rating.

    The query joins ``publishers`` to ``stories`` and ``publisher_clusters``,
    left-joins ``publisher_bias``, keeps rows where ``bias_id`` is null and
    orders publishers by story count, descending. The query result is not
    fetched, printed or returned.
    """
    sql = """
        SELECT
            p.id as publisher_id
            ,p.name
            ,p.tld
            ,cast(b.bias_id as int) as bias_id
            ,count(1) as stories
        FROM publishers p
        JOIN stories s
            ON s.publisher_id = p.id
        JOIN publisher_clusters c
            ON c.publisher_id = p.id
        LEFT JOIN publisher_bias b
            ON b.publisher_id = p.id
        where bias_id is null
        group by
            p.id
            ,p.name
            ,p.tld
            ,b.bias_id
        ORDER BY count(1) desc
    """

    with connect() as db:
        db.query(sql)
|
|
|
|
|
|
@click.command('links:confusion')
def link_confusion():
    """Plot a confusion matrix for a kNN bias classifier on held-out publishers.

    Reads each publisher's bias ordinal from the ``top`` schema, restricts
    ``top.link_edges`` to edges whose parent and child both have a bias
    rating, and pivots the edges into a ``parent_id`` x ``child_id`` matrix of
    ``links`` (missing pairs become 0). A 5-nearest-neighbour classifier is
    fitted on a random train split and evaluated on the matching test split.
    The figure is written to ``link_confusion.png`` under ``paths('figures')``.

    NOTE(review): a later definition in this file reuses the function name
    ``link_confusion`` and the same output path, so this command object is
    shadowed at module level — confirm how commands are registered.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"
    save_to = paths('figures') / filename

    with connect() as db:
        # NOTE(review): the second query's `from bias` appears to rely on the
        # database resolving this local DataFrame by its variable name
        # (DuckDB-style replacement scan) — keep the name `bias` and keep the
        # query call inside this function; confirm against connect().
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # Attach each row's label by publisher id instead of relying on sort
    # order: `bias` can contain publishers that never appear as a parent, so
    # a positional pairing of pivot rows and bias rows is not guaranteed.
    labelled = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    x = labelled.loc[:, ~labelled.columns.isin(['publisher_id', 'ordinal'])].values
    y = labelled['ordinal']

    # Split features and labels in ONE call so both receive the same shuffle;
    # two separate calls permute x and y independently and break the pairing.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    # Named so the list does not shadow the `ticklabels` helper imported at
    # the top of the file.
    bias_names = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=bias_names, yticklabels=bias_names)
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
|
|
|
|
@click.command('links:classifier')
def link_confusion():
    """Fit a kNN bias classifier on the link matrix and plot its confusion matrix.

    Reads publishers whose ``ordinal`` is not -1 from ``mbfc.publishers``,
    restricts ``link_edges`` to edges whose parent and child are both in that
    set, and pivots the edges into a ``parent_id`` x ``child_id`` matrix of
    ``links`` (missing pairs become 0). The ordinal label is merged on by
    publisher id. A 5-nearest-neighbour classifier is fitted on every row and
    then predicts those same rows, so the matrix describes the fit on the
    training data, not held-out performance. The figure is written to
    ``link_confusion.png`` under ``paths('figures')``.

    NOTE(review): this definition reuses the function name ``link_confusion``
    and the output path of the 'links:confusion' command defined earlier in
    this file; the later definition wins at module level and both commands
    write the same file — confirm whether that is intended.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "link_confusion.png"

    # Both queries share one connection, as the sibling command does.
    with connect() as db:
        # NOTE(review): the second query's `from bias` appears to rely on the
        # database resolving this local DataFrame by its variable name
        # (DuckDB-style replacement scan) — keep the name `bias` and keep the
        # query call inside this function; confirm against connect().
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,p.ordinal
            FROM mbfc.publishers p
            where ordinal != -1
        """).df()

        df = db.query("""
            SELECT
                *
            FROM link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    # Join labels onto the matrix rows by publisher id.
    publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    # Features are every column except the id and the label.
    x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
    y = publisher_matrix['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)
    publisher_matrix['pred'] = y_pred

    fig, ax = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
    ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
|
|
|
|
# ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
|
|
# plt.savefig(out_dir / filename)
|
|
# plt.close()
|
|
# print(f"saved plot: {filename}")
|