import click
import pandas as pd

from data.main import connect

# Columns of link_edges that may be used as the edge weight. The chosen name is
# interpolated into SQL, so it is validated against this whitelist first.
_EDGE_WEIGHTS = ('links', 'normalized', 'onehot')


def _load_adjacency(db, source):
    """Return the publisher adjacency matrix built from link_edges.

    Rows are parent publishers, columns are child publishers, and each cell
    holds the `source` column of link_edges (0 where no edge exists).

    Raises:
        ValueError: if `source` is not one of _EDGE_WEIGHTS.
    """
    if source not in _EDGE_WEIGHTS:
        raise ValueError(f"unknown edge weight: {source!r}")
    edges = db.query(f"""
        SELECT
            parent_id
            ,child_id
            ,{source} as links
        FROM link_edges
    """).df()
    return edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)


def _load_publisher_ids(db):
    """Return a one-column frame (`id`) of publishers present in publisher_stories."""
    # DISTINCT keeps one row per publisher even if publisher_stories holds
    # several rows for it; otherwise the later merge would duplicate rows.
    return db.query("""
        SELECT DISTINCT
            p.id
        FROM mbfc.publishers p
        JOIN mbfc.publisher_stories ps
            ON p.id = ps.publisher_id
    """).df()


@click.command('links:create-table')
def create_table():
    """Create the link_edges table of publisher-to-publisher link counts."""
    with connect() as db:
        # Only publishers that appear both as a parent and as a child are kept
        # (see the two IN filters at the end of the statement).
        db.query("""
            CREATE OR REPLACE TABLE link_edges AS
            with cte as(
                SELECT
                    s.publisher_id as parent_id
                    ,r.publisher_id as child_id
                    ,count(1) as links
                FROM stories s
                JOIN related_stories r
                    ON s.id = r.parent_id
                group by
                    s.publisher_id
                    ,r.publisher_id
            )
            SELECT
                cte.parent_id
                ,cte.child_id
                ,cte.links as links
                ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
                ,case when cte.links > 0 then 1 else 0 end as onehot
            FROM cte
            WHERE cte.child_id in (
                SELECT distinct parent_id
                FROM cte
            )
            AND cte.parent_id in (
                SELECT distinct child_id
                FROM cte
            )
        """)

        # NOTE(review): the result of this query is discarded. Presumably it is
        # a smoke test that the new table is readable -- confirm whether the
        # connection evaluates it eagerly, otherwise it has no effect.
        db.query("""
            SELECT
                *
                ,count(1) over()
            FROM link_edges e
            limit 1
        """)
        print("created link_edges")


@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """Create a table of 2D PCA coordinates per publisher.

    The adjacency matrix from link_edges (weighted by --source) is projected
    onto two principal components and written to publisher_pca_<source>.
    """
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        pub = _load_publisher_ids(db)
        pivot = _load_adjacency(db, source)

    components = PCA(n_components=2).fit_transform(pivot)

    # Build the frame directly rather than assigning columns onto a slice of
    # `pivot.reset_index()`.
    out = pd.DataFrame({
        'parent_id': pivot.index,
        'first': components[:, 0],
        'second': components[:, 1],
    })
    # Inner merge: drops parents that are not in the publisher lookup.
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')

    with connect() as db:
        # The SQL refers to the local DataFrame `out` by name; the variable
        # must keep this name (presumably resolved by the database driver's
        # DataFrame scan -- confirm against data.main.connect).
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)
    print(f"created {table_name}")


@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
@click.option('--k', type=click.IntRange(min=1), default=8, show_default=True,
              help='number of k-means clusters')
def create_clusters(source, k=8):
    """Create a table of k-means cluster labels per publisher.

    Rows of the link adjacency matrix (weighted by --source) are clustered
    and the labels are written to publisher_clusters_<source>.
    """
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"

    with connect() as db:
        pivot = _load_adjacency(db, source)
        pub = _load_publisher_ids(db)

    labels = KMeans(n_clusters=k, n_init="auto").fit_predict(pivot)

    out = pd.DataFrame({'parent_id': pivot.index, 'label': labels})
    # Inner merge: drops parents that are not in the publisher lookup.
    out = pd.merge(out, pub, left_on='parent_id', right_on='id')
    new_table = out[['id', 'label']]

    with connect() as db:
        # The SQL refers to the local DataFrame `new_table` by name; the
        # variable must keep this name.
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)
    print(f"created {table_name}")