wwu-577/src/data/links.py

136 lines
4.0 KiB
Python

import click
from data.main import connect
import pandas as pd
@click.command('links:create-table')
def create_table():
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE link_edges AS
with cte as(
SELECT
s.publisher_id as parent_id
,r.publisher_id as child_id
,count(1) as links
FROM stories s
JOIN related_stories r
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
cte.parent_id
,cte.child_id
,cte.links as links
,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
,case when cte.links > 0 then 1 else 0 end as onehot
FROM cte
WHERE cte.child_id in (
SELECT
distinct parent_id
FROM cte
)
AND cte.parent_id in (
SELECT
distinct child_id
FROM cte
)
""")
db.query("""
SELECT
*
,count(1) over()
FROM link_edges e
limit 1
""")
print(f"created link_edges")
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
with connect() as db:
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
svd = PCA(n_components=2)
svd_out = svd.fit_transform(pivot)
out = pivot.reset_index()[['parent_id']]
out['first'] = svd_out[:, 0]
out['second'] = svd_out[:, 1]
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
out.id as publisher_id
,out.first as first
,out.second as second
FROM out
""")
print(f"created {table_name}")
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
parent_id
,child_id
,{source} as links
FROM link_edges
""").df()
pub = db.query("""
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
n.id as publisher_id
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")