add exp 4/5

This commit is contained in:
matt
2023-05-17 21:38:21 -07:00
parent 74c2d8afa2
commit 3f7b3ad467
16 changed files with 905 additions and 59 deletions

View File

@@ -335,3 +335,92 @@ def another_norm():
on sv2.id = s.id
limit 5
""")
@click.command('data:create-election-table')
def create_elections_table():
    """Build the `election_dates` and `election_distance` tables.

    Reads the pipe-delimited `election_dates.csv` from the data directory
    and stores it as `election_dates` with a generated `id`. Then, for each
    distinct `published_at` value in `top.stories`, keeps the single
    election whose absolute day distance is smallest and writes it to
    `election_distance`.

    `days_away` is computed from `e.date - s.published_at`, so it is
    positive when the election falls after the publish date. When two
    elections are equally close, which one is kept is not specified by the
    query (the window has no tie-breaker).
    """
    # NOTE: this local must stay named `df` -- the first query refers to it
    # by name in its FROM clause (presumably resolved from Python scope by
    # the database driver; confirm against what `connect()` returns).
    df = pd.read_csv(data_dir() / 'election_dates.csv', sep="|")
    df['date'] = pd.to_datetime(df.date)

    DB = connect()
    try:
        DB.query("""
            CREATE OR REPLACE TABLE election_dates AS
                SELECT
                    row_number() over() as id
                    ,type
                    ,date
                FROM df
        """)
        DB.query("""
            CREATE OR REPLACE TABLE election_distance AS
                WITH cte as (
                    SELECT
                        day(e.date - s.published_at) as days_away
                        ,e.id as election_id
                        ,e.date as election_date
                        ,s.published_at as publish_date
                    FROM (
                        SELECT
                            DISTINCT
                            published_at
                        FROM top.stories
                    ) s
                    CROSS JOIN election_dates e
                ) , windowed as (
                    SELECT
                        row_number() over(partition by publish_date order by abs(days_away) asc) as rn
                        ,days_away
                        ,publish_date
                        ,election_date
                        ,election_id
                    FROM cte
                )
                SELECT
                    days_away
                    ,publish_date
                    ,election_date
                    ,election_id
                FROM windowed
                WHERE rn = 1
        """)
    finally:
        # Release the connection even when one of the statements fails.
        DB.close()
@click.command('scrape:create-denorm')
def create_denorm():
    """Build the flattened `denorm.stories` table.

    Joins each row of `top.stories` with its publisher, sentiment class,
    nearest-election distance, publisher bias rating, publisher PCA
    components and emotion into one wide table. Every join is an inner
    join, so a story missing any of those related rows is left out of the
    result.

    The command can be re-run: the schema is created only when absent and
    the table is replaced.
    """
    DB = connect()
    try:
        # IF NOT EXISTS keeps a second run from failing on the schema
        # before it ever reaches the CREATE OR REPLACE below.
        DB.sql("CREATE SCHEMA IF NOT EXISTS denorm")
        DB.sql("""
            CREATE OR REPLACE TABLE denorm.stories AS
                SELECT
                    s.id as story_id
                    ,s.title
                    ,s.url
                    ,s.published_at
                    ,s.author
                    ,p.name as publisher
                    ,p.tld as tld
                    ,sent.class_id as sentiment
                    ,d.days_away as election_distance
                    ,b.ordinal as bias
                    ,pca.first as link_1
                    ,pca.second as link_2
                    ,e.emotion_id as emotion
                FROM top.stories s
                JOIN top.publishers p
                    ON p.id = s.publisher_id
                JOIN top.story_sentiments sent
                    ON s.id = sent.story_id
                -- election_distance holds one row per publish_date, so that is
                -- the key to match on; matching election_date would keep only
                -- stories published exactly on an election day.
                JOIN election_distance d
                    ON d.publish_date = s.published_at
                JOIN publisher_bias pb
                    ON pb.publisher_id = p.id
                JOIN bias_ratings b
                    ON b.id = pb.bias_id
                JOIN top.publisher_pca_onehot pca
                    ON pca.publisher_id = p.id
                JOIN story_emotions e
                    ON e.story_id = s.id
        """)
    finally:
        # Release the connection even when a statement fails.
        DB.close()