import click
from data.main import connect, ticklabels, paths
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd


@click.command('links:elbow')
def elbow():
    """Save an elbow plot of k-means inertia over the publisher link matrix."""
    from sklearn.cluster import KMeans

    save_to = paths('figures') / 'link_cluster_elbow.png'

    with connect() as db:
        df = db.query("""
            SELECT
                *
            FROM link_edges
        """).df()

    # One row per parent publisher, one column per child publisher; pairs
    # with no recorded edge become 0.
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    scores = []
    for k in range(2, 15):
        kmeans = KMeans(n_clusters=k, n_init="auto")
        kmeans.fit(pivot)
        label = kmeans.labels_
        # The silhouette coefficient is recorded next to the inertia, but only
        # the inertia is drawn below.
        coeff = silhouette_score(pivot, label, metric='euclidean')
        scores.append({'k': k, 'inertia': kmeans.inertia_, 'coeff': coeff})
    to_plot = pd.DataFrame(scores)

    ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
    ax.set(
        title="elbow criterion plot of clusters",
        xlabel="bin size (k)",
        ylabel="sum of squared distances between centroids/points",
    )
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")


# randomly pick 8


@click.command('links:pca-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
    """Save a scatter plot of the first two PCA components coloured by cluster."""
    save_to = paths('figures') / f"link_pca_clusters_{source}.png"

    # `source` is interpolated into table names. When invoked through the CLI
    # it is restricted by click.Choice above to three fixed values.
    # NOTE(review): the `s` subquery counts rows of mbfc.publishers grouped by
    # id, so `stories` is 1 whenever ids are unique -- confirm whether a
    # stories table was meant to be counted instead.
    with connect() as db:
        df = db.query(f"""
            SELECT
                c.label as cluster
                ,p.tld
                --,b.label as bias
                ,pca.first
                ,pca.second
                ,s.cnt as stories
            FROM publisher_clusters_{source} c
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = c.publisher_id
            JOIN mbfc.publishers p
                ON ps.publisher_id = p.id
            JOIN (
                select
                    p.id as publisher_id
                    ,count(1) as cnt
                FROM mbfc.publishers p
                GROUP BY
                    p.id
            ) s
                ON s.publisher_id = p.id
            JOIN publisher_pca_{source} pca
                ON pca.publisher_id = p.id
        """).df()

    ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
    ax.set(
        title=f"pca components vs. clusters ({source})",
        xlabel="first pca component",
        ylabel="second pca component",
    )
    plt.savefig(save_to)
    # Close the figure like the other commands in this module do, so this
    # scatter does not bleed into the next plot drawn in the same process.
    plt.close()
    print(f"saved plot: {save_to}")


def test():
    """Run a scratch query over clustered publishers filtered to a null bias_id.

    The query result is not fetched or returned.
    """
    with connect() as db:
        db.query("""
            SELECT
                p.id as publisher_id
                ,p.name
                ,p.tld
                ,cast(b.bias_id as int) as bias_id
                ,count(1) as stories
            FROM publishers p
            JOIN stories s
                ON s.publisher_id = p.id
            JOIN publisher_clusters c
                ON c.publisher_id = p.id
            LEFT JOIN publisher_bias b
                ON b.publisher_id = p.id
            where bias_id is null
            group by
                p.id
                ,p.name
                ,p.tld
                ,b.bias_id
            ORDER BY count(1) desc
        """)


# This command was originally also named `link_confusion` and was shadowed by
# the later definition of that name, which made it unreachable as a module
# attribute. It has its own name now; `link_confusion` still refers to the
# 'links:classifier' command defined further down.
@click.command('links:confusion')
def link_confusion_holdout():
    """Save a confusion matrix for a kNN bias classifier on a held-out split."""
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    # NOTE(review): this is the same file name the 'links:classifier' command
    # writes, so running both overwrites one plot with the other.
    filename = "link_confusion.png"
    save_to = paths('figures') / filename

    with connect() as db:
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        # `bias` in the subqueries refers to the DataFrame fetched just above.
        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # Attach each publisher's label to its own feature row. Taking every row
    # of `bias` sorted by id does not guarantee the same length or order as
    # the pivot rows.
    labelled = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    x = labelled.loc[:, ~labelled.columns.isin(['publisher_id', 'ordinal'])].values
    y = labelled['ordinal']

    # Split features and labels in ONE call so both receive the same shuffle;
    # two separate calls pair each feature row with an unrelated label.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    # Local name chosen so it does not shadow the imported `ticklabels` helper.
    tick_names = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(
        title="confusion matrix for kNN classifier on test data.",
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")


@click.command('links:classifier')
def link_confusion():
    """Save an in-sample confusion matrix for a kNN bias classifier.

    The model is fitted and evaluated on the same rows, so the matrix
    shows training-set fit rather than held-out accuracy.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    save_to = paths('figures') / "link_confusion.png"

    with connect() as db:
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,p.ordinal
            FROM mbfc.publishers p
            where ordinal != -1
        """).df()

    # `bias` in the subqueries refers to the DataFrame fetched just above.
    with connect() as db:
        df = db.query("""
            SELECT
                *
            FROM link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # Join each publisher's link row with its label so features and targets
    # stay aligned row by row.
    publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
    y = publisher_matrix['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)

    publisher_matrix['pred'] = y_pred

    fig, ax = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
    ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")