add better normalization. add link similarity.
src/bias.py (40 lines changed)
@@ -7,14 +7,16 @@ import os
import csv


def map(rating:str) -> int:

    mapping = {
-        'right' : 0,
+        'left' : 0,
        'left-center' : 1,
        'center' : 2,
-        'left' : 3,
-        'allsides' : 4,
-        'right-center' : 5
+        'right-center' : 3,
+        'right' : 4,
+        'allsides' : -1,
    }

    return mapping[rating]
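Review note: the viewer flattened the old and new dicts together above; my reading is that the scale now runs 0-4 from left to right, with 'allsides' moved to a -1 sentinel. A minimal sanity check of that assumption (map_rating is a renamed stand-in, since the module's map() shadows the builtin):

```python
# Hypothetical check of the post-commit ordinal scale in src/bias.py.
def map_rating(rating: str) -> int:  # stand-in name for bias.map
    mapping = {
        'left': 0,
        'left-center': 1,
        'center': 2,
        'right-center': 3,
        'right': 4,
        'allsides': -1,  # sentinel: no single position on the left-right axis
    }
    return mapping[rating]

assert map_rating('center') == 2
assert map_rating('right') - map_rating('left') == 4  # full width of the scale
assert map_rating('allsides') == -1                   # filtered out downstream
```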
@@ -35,13 +37,39 @@ def load() -> None:

def normalize() -> None:
    DB = connect()

    DB.sql("""
        CREATE OR REPLACE TABLE publisher_bias AS
        WITH cte AS (
            SELECT
                p.id
                ,b.bias as label
                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
            FROM bias_ratings b
            JOIN publishers p
                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
        ), ranked AS (
            SELECT
                id
                ,label
                ,similarity
                ,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
            FROM cte
        )
        SELECT
            id
            ,label
        FROM ranked
        WHERE ranked.rn = 1
    """)

    DB.sql("""
        with cte as (
            select
-                s.publisher
+                s.publisher_id
                ,count(1) as stories
            from stories s
-            group by s.publisher
+            group by s.publisher_id
        )
        select
            s.publisher
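Review note: the pattern in this hunk is a fuzzy join on name similarity followed by keep-the-best-match-per-id via ROW_NUMBER. A sketch of the same logic outside DuckDB, with difflib's SequenceMatcher standing in for JARO_WINKLER_SIMILARITY (a different string metric, so the 0.95 threshold would not carry over; 0.8 here is purely for the demo, and the data is invented):

```python
from difflib import SequenceMatcher

publishers = [(1, 'The New York Times'), (2, 'Fox News')]
ratings = [('New York Times', 'left-center'), ('FOX News', 'right')]

def similarity(a: str, b: str) -> float:
    # stand-in for DuckDB's JARO_WINKLER_SIMILARITY; both return 0..1
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

best = {}  # publisher id -> (similarity, label); mirrors "rn = 1" in the SQL
for pid, name in publishers:
    for rated_name, label in ratings:
        score = similarity(name, rated_name)
        if score > 0.8 and score > best.get(pid, (0.0, None))[0]:
            best[pid] = (score, label)

print({pid: label for pid, (score, label) in best.items()})
# {1: 'left-center', 2: 'right'}
```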
@@ -1,10 +1,12 @@
import click
+from dotenv import load_dotenv

@click.group()
def cli():
    ...

if __name__ == "__main__":
+    load_dotenv()
    import scrape
    cli.add_command(scrape.download)
    cli.add_command(scrape.parse)
src/emotion.py (155 lines changed)
@@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
+from matplotlib.dates import DateFormatter
+import matplotlib.dates as mdates

def data():
    # load data
@@ -126,24 +128,153 @@ def normalize():
    """)
    DB.close()

@click.command("emotion:analyze")
def coef_over_time():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def results(buckets = '1 month'):
        results = DB.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()
        return results

    def get_coef(label):
        reg = linear_model.LinearRegression()
        df = results[results['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        # sns.lineplot(x=x_test.flatten(), y=y_pred)
        return reg.coef_

    collection = []
    results = results('2 year')
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(emotion)[0]
        if coef > 0:
            increasing = True
        else:
            increasing = False
        collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
    pd.DataFrame(collection).sort_values('coef')

    plt.show()

@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    def get_coef(emotion):
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_

    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    DB.sql("""
        WITH grouped as (
            SELECT
                YEAR(s.published_at) as year
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            WHERE YEAR(s.published_at) < 2022
            AND label = 'annoyance'
            GROUP BY
                YEAR(s.published_at)
                ,e.label
        ), total AS (
            SELECT
                e.label
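Review note: in both get_coef variants, x and y are split with two independent train_test_split calls; each call shuffles with its own random permutation, so the fitted (x, y) pairs no longer correspond and the learned slope is close to meaningless. A single call keeps the rows aligned; a minimal sketch of the fix on synthetic data (not the repo's tables):

```python
# Hypothetical corrected split: one call keeps x/y rows paired.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x = np.arange(24, dtype=float).reshape(-1, 1)  # stand-in for the time buckets
y = 0.01 * x.ravel() + 0.1                     # stand-in for the story shares

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
reg = LinearRegression().fit(x_train, y_train)
print(reg.coef_[0])  # slope: the per-emotion trend statistic the hunk collects
```

Two smaller issues visible in the hunk: both functions register the same click command name "emotion:analyze", and analyze() interpolates an undefined yearly variable into its final query, so it likely predates the coef_over_time rewrite above it.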
src/links.py (new file, 111 lines)
@@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt


def to_matrix():
    """returns an adjacency matrix of publishers to publisher link frequency"""

    DB = connect()

    bias_map = pd.DataFrame([
        {'label' : 'left', 'value' : 0},
        {'label' : 'left-center', 'value' : 1},
        {'label' : 'center', 'value' : 2},
        {'label' : 'right-center', 'value' : 3},
        {'label' : 'right', 'value' : 4},
        {'label' : 'allsides', 'value' : -1},
    ])
    bias = DB.sql("""
        SELECT
            b.id
            ,b.label
            ,m.value
        FROM publisher_bias b
        JOIN bias_map m
            ON b.label = m.label
        WHERE value != -1
    """).df()

    pub = DB.sql("""
        select
            p.id
            ,p.name
            ,p.url
            ,b.label
            ,b.value
        from publishers p
        left join bias b
            on b.id = p.id
    """).df()

    edges = DB.sql("""
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
                ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
            ON p.id = cte.parent_id
    """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    out = pd.DataFrame(adj.index.values, columns=['id'])
    out = pd.merge(out, pub, how='left', on='id')

    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)

    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)

    x = svd_out[:, 0]
    y = svd_out[:, 1]

    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()

    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)

    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()

    sns.scatterplot(x=pub['first'], y=pub['second'], hue=pub['bias'])
    plt.show()
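Review note: to_matrix pivots the link counts into a publisher-by-publisher adjacency matrix, projects it with PCA (TruncatedSVD is computed as an alternative), and clusters the projection with MiniBatchKMeans. The same shape of pipeline on synthetic data (every value here is invented for the demo):

```python
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

# synthetic edge list: parent publisher links to child publisher `links` times
edges = pd.DataFrame({
    'parent_id': [1, 1, 2, 2, 3, 3, 4],
    'child_id':  [2, 3, 1, 3, 1, 2, 1],
    'links':     [5, 1, 4, 2, 1, 6, 3],
})

# same pivot as links.py: rows = parents, columns = children, 0 where no link
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

pca_out = PCA(n_components=2).fit_transform(adj)

kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init='auto')
pred = kmeans.fit_predict(pca_out)
print(pred)  # one cluster label per publisher row of the adjacency matrix
```

The final scatterplot references pub['first'], pub['second'], and pub['bias'], columns the pub frame defined above does not have, so that line looks like scratch work left in the commit.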
@@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
+from urllib.parse import urlparse

@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@@ -103,12 +104,14 @@ def parse(directory, output_dir):

            url = item.xpath('.//strong/a')[0].get('href')
            out['url'] = url
+            out['publisher_url_domain'] = urlparse(publisher_url).netloc
            out['domain'] = urlparse(url).netloc

            item_id = hash((page.stem, url))
            out['id'] = item_id

-            old_id = hash((title, page.stem, publisher_url))
-            out['old_id'] = old_id
+            # old_id = hash((title, page.stem, publisher_url))
+            # out['old_id'] = old_id
            published.append(out)

            related = item.xpath(".//span[contains(@class, 'mls')]/a")
@@ -118,6 +121,7 @@ def parse(directory, output_dir):
                another['url'] = relation.get('href')
                another['publisher'] = relation.text
                another['parent_id'] = item_id
+                another['publisher_domain'] = urlparse(another['url']).netloc
                others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
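Review note on the id scheme, an observation rather than part of the commit: Python's builtin hash() is randomized per process for strings (PYTHONHASHSEED), so hash((page.stem, url)) produces different ids on every run, and re-running parse would orphan previously stored parent_id references. A deterministic alternative would look like:

```python
# Hypothetical stable replacement for hash((page.stem, url)).
import hashlib

def stable_id(page_stem: str, url: str) -> int:
    digest = hashlib.sha256(f'{page_stem}|{url}'.encode()).digest()
    # deterministic across processes, unlike builtin hash(); fits a BIGINT
    return int.from_bytes(digest[:8], 'big', signed=True)
```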
@@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    DB = connect()

    DB.sql("""
        DELETE FROM stories
        WHERE id IN (
@@ -146,29 +151,77 @@ def normalize():
            OR title_ctn > 1
        )
    """)

    DB.sql("""
        CREATE OR REPLACE TABLE publishers AS
        with cte as (
            SELECT
-                s.publisher
-                ,s.publisher_url
+                s.publisher as name
+                ,s.publisher_url_domain as url
            FROM stories s
            GROUP BY
                s.publisher
-                ,s.publisher_url
+                ,s.publisher_url_domain
        ), together AS (
            SELECT
-                COALESCE(cte.publisher, r.publisher) AS publisher
-                ,cte.publisher_url
+                COALESCE(cte.name, r.publisher) AS name
+                ,COALESCE(cte.url, r.publisher_domain) as url
            FROM cte
            FULL OUTER JOIN related_stories r
-                ON cte.publisher = r.publisher
+                ON cte.url = r.publisher_domain
        )
        SELECT
            ROW_NUMBER() OVER() as id
-            ,t.*
+            ,t.name
+            ,t.url
        FROM together t
+        where t.url is not null
        GROUP BY
-            publisher
-            ,publisher_url
+            name
+            ,url
    """)

    DB.sql("""
        alter table stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = stories.publisher_url_domain
    """)

    DB.sql("""
        alter table stories alter publisher_id set data type bigint
    """)

    DB.sql("""
        alter table stories drop publisher;
        alter table stories drop publisher_url;
        alter table stories drop publisher_url_domain;
        alter table stories drop domain;
    """)

    DB.sql("""
        alter table related_stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update related_stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = related_stories.publisher_domain
    """)

    DB.sql("""
        alter table related_stories drop publisher;
        alter table related_stories drop publisher_domain;
    """)
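Review note: swapping the join key from publisher display name to URL domain is the heart of the "better normalization" in the commit title: names drift across scrapes ('FOX News' vs 'Fox News') while the netloc stays stable. A self-contained illustration with duckdb and urllib (table name and values invented for the demo):

```python
import duckdb
from urllib.parse import urlparse

urls = ['https://www.foxnews.com/politics/story-1',
        'https://www.foxnews.com/us/story-2']
print({urlparse(u).netloc for u in urls})  # one domain, however the name is spelled

db = duckdb.connect()
db.sql("CREATE TABLE demo_stories AS SELECT * FROM (VALUES "
       "('Fox News', 'www.foxnews.com'), ('FOX News', 'www.foxnews.com')) "
       "t(publisher, publisher_url_domain)")
# same shape as the commit's publishers rebuild: one row per domain, with an id
db.sql("""
    SELECT ROW_NUMBER() OVER() AS id, name, url
    FROM (SELECT ANY_VALUE(publisher) AS name, publisher_url_domain AS url
          FROM demo_stories GROUP BY publisher_url_domain)
""").show()
```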
@@ -81,3 +81,4 @@ def distance():
    min_index = (np.argmin(distances))
    closest = np.unravel_index(min_index, distances.shape)
    distances.flatten().shape
+   DB.close()
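Review note: the argmin/unravel_index pairing in distance() is the standard way to turn a flat argmin into a (row, col) index; a tiny sketch with made-up numbers:

```python
import numpy as np

distances = np.array([[0.9, 0.2],
                      [0.4, 0.7]])
closest = np.unravel_index(np.argmin(distances), distances.shape)
print(closest)  # (0, 1): the smallest entry sits at row 0, column 1
```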