wwu-577/src/plots/links.py

227 lines
6.7 KiB
Python

import click
from data.main import connect, ticklabels, paths
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
@click.command('links:elbow')
def elbow():
    """Save an elbow plot of k-means inertia over the publisher link matrix.

    Reads every row of ``link_edges``, pivots it into a parent-by-child
    matrix of ``links`` values (missing pairs become 0), fits k-means for
    each k in 2..14 and writes the inertia curve to
    ``link_cluster_elbow.png`` under ``paths('figures')``.

    A silhouette coefficient is also computed for every k and stored next
    to the inertia, but only the inertia curve is drawn.
    """
    from sklearn.cluster import KMeans

    figure_path = paths('figures') / 'link_cluster_elbow.png'

    with connect() as db:
        edges = db.query("""
            SELECT
                *
            FROM link_edges
        """).df()

    link_matrix = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    def score(k):
        # One fit per candidate k: inertia feeds the plot, the silhouette
        # coefficient is recorded alongside it.
        model = KMeans(n_clusters=k, n_init="auto").fit(link_matrix)
        silhouette = silhouette_score(link_matrix, model.labels_, metric='euclidean')
        return {'k': k, 'inertia': model.inertia_, 'coeff': silhouette}

    scores = pd.DataFrame([score(k) for k in range(2, 15)])

    ax = sns.lineplot(x=scores['k'], y=scores['inertia'])
    ax.set_title("elbow criterion plot of clusters")
    ax.set_xlabel("bin size (k)")
    ax.set_ylabel("sum of squared distances between centroids/points")

    plt.savefig(figure_path)
    plt.close()
    print(f"saved plot: {figure_path}")
# randomly pick 8
@click.command('links:pca-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
    """Scatter the first two PCA components of publishers, coloured by cluster.

    Args:
        source: Which precomputed representation to plot. It selects the
            ``publisher_clusters_{source}`` and ``publisher_pca_{source}``
            tables and is part of the output file name. click restricts it
            to 'links', 'normalized' or 'onehot'.

    The figure is written to ``link_pca_clusters_{source}.png`` under
    ``paths('figures')``.
    """
    save_to = paths('figures') / f"link_pca_clusters_{source}.png"

    with connect() as db:
        # `source` is interpolated into table names, which cannot be bound
        # as query parameters; the click.Choice above limits it to three
        # known values when invoked through the CLI.
        # NOTE(review): the `s` subquery counts rows of mbfc.publishers per
        # publisher id, and `ps` is joined but none of its columns are
        # selected -- confirm that `stories` is the intended count and that
        # the `ps` join does not duplicate points.
        df = db.query(f"""
            SELECT
                c.label as cluster
                ,p.tld
                --,b.label as bias
                ,pca.first
                ,pca.second
                ,s.cnt as stories
            FROM publisher_clusters_{source} c
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = c.publisher_id
            JOIN mbfc.publishers p
                ON ps.publisher_id = p.id
            JOIN
            (
                select
                    p.id as publisher_id
                    ,count(1) as cnt
                FROM mbfc.publishers p
                GROUP BY
                    p.id
            ) s
                ON s.publisher_id = p.id
            JOIN publisher_pca_{source} pca
                ON pca.publisher_id = p.id
        """).df()

    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
    ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
    plt.savefig(save_to)
    # Close the figure like the other plotting commands in this module do,
    # so it is not left open as pyplot's current figure for whatever is
    # drawn next in the same process.
    plt.close()
    print(f"saved plot: {save_to}")
def test():
    """Run an ad-hoc query over clustered publishers whose bias_id is null.

    Joins publishers to their stories and clusters, left-joins
    ``publisher_bias`` and keeps rows where ``bias_id`` is null, grouped per
    publisher and ordered by story count (descending). The query result is
    not fetched or returned.
    """
    sql = """
        SELECT
            p.id as publisher_id
            ,p.name
            ,p.tld
            ,cast(b.bias_id as int) as bias_id
            ,count(1) as stories
        FROM publishers p
        JOIN stories s
            ON s.publisher_id = p.id
        JOIN publisher_clusters c
            ON c.publisher_id = p.id
        LEFT JOIN publisher_bias b
            ON b.publisher_id = p.id
        where bias_id is null
        group by
            p.id
            ,p.name
            ,p.tld
            ,b.bias_id
        ORDER BY count(1) desc
    """
    with connect() as db:
        db.query(sql)
@click.command('links:confusion')
def link_confusion():
    """Plot a held-out confusion matrix for a kNN bias classifier on links.

    Loads each publisher's bias ordinal and the link edges between rated
    publishers, pivots the edges into a parent-by-child matrix of ``links``
    values (missing pairs become 0), trains a 5-nearest-neighbour classifier
    on a random train split and plots predictions for the test split to
    ``link_confusion.png`` under ``paths('figures')``.

    NOTE(review): a later function in this module is also named
    ``link_confusion`` and writes the same file name, so at module level
    that definition replaces this name -- confirm how commands are
    registered.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"
    save_to = paths('figures') / filename

    with connect() as db:
        # Keep this local named `bias`: the second query selects `from bias`,
        # which presumably is resolved against this DataFrame by the database
        # layer (e.g. a DuckDB replacement scan) -- confirm against `connect`.
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()
        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # Attach each row's label by publisher id instead of relying on both
    # frames happening to share the same sort order and length: a rated
    # publisher with no outgoing links has no row in the pivot.
    publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    is_feature = ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])
    x = publisher_matrix.loc[:, is_feature].values
    y = publisher_matrix['ordinal']

    # Split features and labels in a single call so both receive the same
    # shuffle; two separate calls shuffle independently and break the
    # row-to-label correspondence.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    # Named `bias_labels` so the `ticklabels` helper imported at module level
    # is not shadowed inside this function.
    bias_labels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=bias_labels, yticklabels=bias_labels)
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
@click.command('links:classifier')
def link_confusion():
    """Plot a confusion matrix for a kNN bias classifier fit on all link data.

    Loads publishers whose ``ordinal`` is not -1 and the link edges between
    them, pivots the edges into a parent-by-child matrix of ``links`` values
    (missing pairs become 0), joins the ordinal onto each row, fits a
    5-nearest-neighbour classifier and plots its predictions against the
    true ordinals to ``link_confusion.png`` under ``paths('figures')``.

    The model is fit and evaluated on the same rows, so the matrix shows
    training-set performance rather than held-out performance.

    NOTE(review): an earlier function in this module is also named
    ``link_confusion`` and writes the same file name -- confirm whether the
    shared name and output path are intended.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "link_confusion.png"

    # One connection serves both queries.
    with connect() as db:
        # Keep this local named `bias`: the second query selects `from bias`,
        # which presumably is resolved against this DataFrame by the database
        # layer (e.g. a DuckDB replacement scan) -- confirm against `connect`.
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,p.ordinal
            FROM mbfc.publishers p
            where ordinal != -1
        """).df()
        df = db.query("""
            SELECT
                *
            FROM link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # Join the label onto each matrix row by publisher id, then separate the
    # feature columns from the two columns the merge added.
    publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
    y = publisher_matrix['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)
    publisher_matrix['pred'] = y_pred

    fig, ax = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
    ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")