finish paper

This commit is contained in:
matt
2023-12-15 09:34:18 -08:00
parent a21ed7a7d9
commit f069a37ca6
19 changed files with 547 additions and 315 deletions

View File

@@ -8,7 +8,7 @@ from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths, reporting_label_to_int
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm
@@ -128,6 +128,8 @@ def create_tables():
df['tld'] = df.tld.apply(map_tld)
df['ordinal'] = df.bias.apply(bias_label_to_int)
df.tld
with connect() as db:
db.sql("""
CREATE OR REPLACE TABLE mbfc.publishers AS

View File

@@ -1,6 +1,8 @@
import click
from data.main import connect
from data.main import connect, paths, ticklabels
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
@click.command('links:create-table')
def create_table():
@@ -53,7 +55,6 @@ def create_table():
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_pca(source):
"""create 2D pca labels"""
from sklearn.decomposition import PCA
table_name = f"publisher_pca_{source}"
@@ -62,8 +63,6 @@ def create_pca(source):
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON p.id = ps.publisher_id
""").df()
df = db.query(f"""
SELECT
@@ -98,9 +97,10 @@ def create_pca(source):
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def create_clusters(source):
"""create link adj. matrix clusters table"""
from sklearn.cluster import KMeans
source = 'links'
table_name = f"publisher_clusters_{source}"
with connect() as db:
df = db.query(f"""
SELECT
@@ -113,17 +113,18 @@ def create_clusters(source):
SELECT
p.*
FROM mbfc.publishers p
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = p.id
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
k = 8
k = 5
kmeans = KMeans(n_clusters=k, n_init="auto")
pred = kmeans.fit_predict(pivot)
out = pivot.reset_index()[['parent_id']]
out['label'] = pred
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
new_table = out[['id', 'label']]
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
@@ -132,4 +133,5 @@ def create_clusters(source):
,n.label as label
FROM new_table n
""")
print(f"created {table_name}")