136 lines
4.0 KiB
Python
136 lines
4.0 KiB
Python
import click
|
|
from data.main import connect
|
|
import pandas as pd
|
|
|
|
@click.command('links:create-table')
def create_table():
    """Build the link_edges table of publisher-to-publisher link counts."""
    # One row per (parent publisher, child publisher) pair:
    #   links      - number of stories/related_stories join rows for the pair
    #   normalized - links divided by the parent's total links, summed over
    #                the rows that survive the WHERE filter (window functions
    #                are evaluated after WHERE)
    #   onehot     - 1 when links > 0; every grouped pair has at least one
    #                joined row, so this is 1 for each row produced here
    # The WHERE clause keeps only edges whose child also occurs as a parent
    # and whose parent also occurs as a child somewhere in the unfiltered CTE.
    build_edges_sql = """
        CREATE OR REPLACE TABLE link_edges AS
        with cte as(
            SELECT
                s.publisher_id as parent_id
                ,r.publisher_id as child_id
                ,count(1) as links
            FROM stories s
            JOIN related_stories r
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            cte.parent_id
            ,cte.child_id
            ,cte.links as links
            ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
            ,case when cte.links > 0 then 1 else 0 end as onehot
        FROM cte
        WHERE cte.child_id in (
            SELECT
                distinct parent_id
            FROM cte
        )
        AND cte.parent_id in (
            SELECT
                distinct child_id
            FROM cte
        )
    """

    # One sample row plus the total row count of the freshly built table.
    preview_sql = """
        SELECT
            *
            ,count(1) over()
        FROM link_edges e
        limit 1
    """

    with connect() as db:
        db.query(build_edges_sql)

        # NOTE(review): the value returned by this query is discarded. If the
        # intent was to display a sanity check, the result needs to be printed
        # or fetched - confirm against what connect()'s query() returns.
        db.query(preview_sql)

    print("created link_edges")
|
|
|
|
@click.command('links:create-pca')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
    """Create 2D PCA labels for publishers from the link_edges matrix.

    Writes the table publisher_pca_<source> with the columns publisher_id,
    first and second, where <source> selects which link_edges weight column
    is used as the matrix value.
    """
    from sklearn.decomposition import PCA

    table_name = f"publisher_pca_{source}"

    with connect() as db:
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON p.id = ps.publisher_id
        """).df()
        # `source` is restricted by click.Choice above, so interpolating it
        # as a column name is safe when invoked through the CLI.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()

    # The join above yields one row per matching publisher_stories row, so a
    # publisher can appear several times; merging on that would duplicate its
    # output row. Only `id` is needed downstream, and dropping the other
    # columns also avoids name clashes with `first`/`second` in the merge.
    publisher_ids = pub[['id']].drop_duplicates()

    # Rows are parent publishers, columns are child publishers; pairs with no
    # edge become 0.
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    pca = PCA(n_components=2)
    components = pca.fit_transform(pivot)

    # NOTE: this frame must stay bound to the local name `out`; the SQL below
    # reads `FROM out`, presumably resolved from this local DataFrame by the
    # database driver (as DuckDB's Python client does) - confirm against
    # connect().
    out = pivot.reset_index()[['parent_id']]
    out['first'] = components[:, 0]
    out['second'] = components[:, 1]
    out = pd.merge(out, publisher_ids, left_on='parent_id', right_on='id')

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                out.id as publisher_id
                ,out.first as first
                ,out.second as second
            FROM out
        """)

    print(f"created {table_name}")
|
|
|
|
|
|
@click.command('links:create-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
@click.option('--k', type=click.IntRange(min=1), default=8, show_default=True,
              help='number of k-means clusters to fit')
def create_clusters(source, k=8):
    """Create the link adjacency matrix clusters table.

    Fits k-means on the publisher-by-publisher matrix built from link_edges
    and writes the table publisher_clusters_<source> with the columns
    publisher_id and label.
    """
    from sklearn.cluster import KMeans

    table_name = f"publisher_clusters_{source}"

    with connect() as db:
        # `source` is restricted by click.Choice above, so interpolating it
        # as a column name is safe when invoked through the CLI.
        df = db.query(f"""
            SELECT
                parent_id
                ,child_id
                ,{source} as links
            FROM link_edges
        """).df()
        pub = db.query("""
            SELECT
                p.*
            FROM mbfc.publishers p
            JOIN mbfc.publisher_stories ps
                ON ps.publisher_id = p.id
        """).df()

    # The join above yields one row per matching publisher_stories row, so a
    # publisher can appear several times; merging on that would duplicate its
    # output row. Only `id` is needed downstream, and dropping the other
    # columns also avoids a name clash with `label` in the merge.
    publisher_ids = pub[['id']].drop_duplicates()

    # Rows are parent publishers, columns are child publishers; pairs with no
    # edge become 0.
    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    # No random_state is set, so labels may differ between runs.
    kmeans = KMeans(n_clusters=k, n_init="auto")
    pred = kmeans.fit_predict(pivot)

    out = pivot.reset_index()[['parent_id']]
    out['label'] = pred
    out = pd.merge(out, publisher_ids, left_on='parent_id', right_on='id')

    # NOTE: this frame must stay bound to the local name `new_table`; the SQL
    # below reads `FROM new_table`, presumably resolved from this local
    # DataFrame by the database driver (as DuckDB's Python client does) -
    # confirm against connect().
    new_table = out[['id', 'label']]

    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                n.id as publisher_id
                ,n.label as label
            FROM new_table n
        """)

    print(f"created {table_name}")
|