wwu-577/src/selection.py

49 lines
951 B
Python

from data.main import connect
import pandas as pd
import numpy as np
# Pull the whole link_edges table into a pandas DataFrame. The connection is
# released in `finally` so a failing query does not leave it open.
DB = connect()
try:
    edges = DB.query("""
select
*
from link_edges
""").df()
finally:
    DB.close()

# Reshape the edge list into a wide matrix: one row per parent_id, one column
# per child_id, cells taken from the `links` column. Pairs that never occur
# come out as NaN from the pivot and are replaced by 0.
# NOTE(review): DataFrame.pivot raises ValueError if a (parent_id, child_id)
# pair appears more than once -- this assumes link_edges is unique per pair.
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

# The matrix index holds the distinct parent_id values; expose them as a
# single-column frame named `publisher_id` for the SQL joins further down.
select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
# Materialise the `top` schema: the selected publishers, their stories in the
# 2006-2022 window, and the related-story rows hanging off those stories.
DB = connect()
try:
    # IF NOT EXISTS keeps the script re-runnable, in line with the
    # CREATE OR REPLACE TABLE statements below; a plain CREATE SCHEMA would
    # fail on the second run.
    DB.query("create schema if not exists top")

    # `select_publishers` is not a database table; it is the pandas DataFrame
    # defined earlier in this script.
    # NOTE(review): presumably resolved by DuckDB's replacement scan of Python
    # variables -- confirm that connect() returns a DuckDB connection.
    DB.query("""
CREATE OR REPLACE TABLE top.publishers AS
SELECT
p.*
FROM publishers p
JOIN select_publishers s
ON s.publisher_id = p.id
""")

    # Stories of the selected publishers, published in years 2006..2022
    # (the upper bound 2023 is exclusive).
    DB.query("""
CREATE OR REPLACE TABLE top.stories AS
SELECT
s.*
FROM stories s
JOIN top.publishers p
ON s.publisher_id = p.id
WHERE year(s.published_at) >= 2006
AND year(s.published_at) < 2023
""")

    # related_stories rows whose parent_id is one of the stories kept above.
    DB.query("""
CREATE OR REPLACE TABLE top.related_stories AS
SELECT
r.*
FROM top.stories s
JOIN related_stories r
ON s.id = r.parent_id
""")
finally:
    # The original never closed this second connection.
    DB.close()