finish paper
This commit is contained in:
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
import click
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
|
||||
from random import randint
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
@@ -128,6 +128,8 @@ def create_tables():
|
||||
df['tld'] = df.tld.apply(map_tld)
|
||||
df['ordinal'] = df.bias.apply(bias_label_to_int)
|
||||
|
||||
df.tld
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
CREATE OR REPLACE TABLE mbfc.publishers AS
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import click
|
||||
from data.main import connect
|
||||
from data.main import connect, paths, ticklabels
|
||||
import pandas as pd
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
@click.command('links:create-table')
|
||||
def create_table():
|
||||
@@ -53,7 +55,6 @@ def create_table():
|
||||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_pca(source):
|
||||
"""create 2D pca labels"""
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
table_name = f"publisher_pca_{source}"
|
||||
|
||||
@@ -62,8 +63,6 @@ def create_pca(source):
|
||||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON p.id = ps.publisher_id
|
||||
""").df()
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
@@ -98,9 +97,10 @@ def create_pca(source):
|
||||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_clusters(source):
|
||||
"""create link adj. matrix clusters table"""
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
source = 'links'
|
||||
table_name = f"publisher_clusters_{source}"
|
||||
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
@@ -113,17 +113,18 @@ def create_clusters(source):
|
||||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON ps.publisher_id = p.id
|
||||
""").df()
|
||||
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
k = 8
|
||||
k = 5
|
||||
kmeans = KMeans(n_clusters=k, n_init="auto")
|
||||
pred = kmeans.fit_predict(pivot)
|
||||
out = pivot.reset_index()[['parent_id']]
|
||||
out['label'] = pred
|
||||
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
|
||||
new_table = out[['id', 'label']]
|
||||
|
||||
with connect() as db:
|
||||
db.query(f"""
|
||||
CREATE OR REPLACE TABLE {table_name} AS
|
||||
@@ -132,4 +133,5 @@ def create_clusters(source):
|
||||
,n.label as label
|
||||
FROM new_table n
|
||||
""")
|
||||
|
||||
print(f"created {table_name}")
|
||||
|
||||
Reference in New Issue
Block a user