rough draft of paper done.

This commit is contained in:
matt
2023-06-07 20:44:48 -07:00
parent 245f60a7a8
commit 7edb8543a7
38 changed files with 1130 additions and 388 deletions

View File

@@ -2,9 +2,24 @@ import data.main
import data.scrape
import data.factcheck
import data.links
import data.bias
import data.emotion
import data.broken_links
import data.selection
import data.sentence
import data.sentiment
import data.word
# Submodule names exported by `from data import *`; each entry corresponds to
# one of the `import data.<name>` statements at the top of this file.
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
,'bias'
,'emotion'
,'broken_links'
,'selection'
,'sentence'
,'sentiment'
,'word'
]

146
src/data/bias.py Normal file
View File

@@ -0,0 +1,146 @@
import click
from data.main import connect, paths
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
@click.command(name="bias:normalize")
def normalize() -> None:
    """Match publishers to bias ratings and add an ordinal to every rating.

    Rebuilds ``publisher_bias`` by fuzzy-matching ``top.publishers.name``
    against ``bias_ratings.publisher`` (lower-cased Jaro-Winkler similarity
    above 0.95) and keeping the best-scoring rating per publisher. Afterwards
    an ``ordinal`` column is added to ``bias_ratings`` and filled from a
    five-step left (-2) to right (2) scale.
    """
    match_publishers = """
        CREATE OR REPLACE TABLE publisher_bias AS
        WITH cte AS (
            SELECT
                p.id as publisher_id
                ,b.id as bias_id
                ,b.bias as label
                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
            FROM bias_ratings b
            JOIN top.publishers p
                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
        ),ranked AS (
            SELECT
                publisher_id
                ,bias_id
                ,label
                ,similarity
                ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
            FROM cte
        )
        SELECT
            publisher_id
            ,label
            ,bias_id
        FROM ranked
        WHERE ranked.rn = 1
    """
    fill_ordinal = """
        update bias_ratings b
        set ordinal = o.ordinal
        FROM mapping o
        WHERE o.label = b.bias
    """

    with connect() as db:
        db.sql(match_publishers)

    # The frame has to live in a local called ``mapping``: the update statement
    # above reads it by that variable name.
    mapping = pd.DataFrame({
        'label': ['left', 'left-center', 'center', 'right-center', 'right'],
        'ordinal': [-2, -1, 0, 1, 2],
    })

    with connect() as db:
        db.query("alter table bias_ratings add column ordinal int")
        db.query(fill_ordinal)
@click.command(name='bias:parse')
def parse() -> None:
    """Parse the saved HTML page of allsides.com bias ratings into a normalized CSV file."""
    source_html = paths('data') / 'allsides.html'
    document = etree.parse(str(source_html), etree.HTMLParser())
    table_rows = document.getroot().xpath('//table[contains(@class,"views-table")]/tbody/tr')

    ratings = []
    for tr in table_rows:
        name = tr.xpath('./td[contains(@class, "source-title")]/a')[0].text
        # The bias label is the last path segment of the rating image's link.
        href = tr.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
        label = href.split('/')[-1]
        agree = tr.xpath('.//span[contains(@class, "agree")]')[0].text
        disagree = tr.xpath('.//span[contains(@class, "disagree")]')[0].text
        ratings.append({
            'publisher': name,
            'bias': label,
            'agree': int(agree),
            'disagree': int(disagree),
        })

    target = paths('data') / 'bias_ratings.csv'
    pd.DataFrame(ratings).to_csv(target, sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
@click.command(name="bias:load")
def load() -> None:
    """Create ``bias_ratings`` from ``bias_ratings.csv`` in the data directory.

    Every CSV column is copied and an ``id`` is prepended, numbered by
    publisher name order.
    """
    csv_path = str(paths('data') / "bias_ratings.csv")
    create_sql = f"""
        CREATE TABLE bias_ratings as
        select
            row_number() over(order by b.publisher) as id
            ,b.*
        from read_csv_auto('{csv_path}') b
    """
    with connect() as db:
        db.sql(create_sql)
@click.command('bias:export')
def export():
    """Write two pipe-delimited CSV snapshots into the data directory.

    ``TMP_publisher_bias.csv`` lists every bias rating ordered by agree votes
    (descending); ``TMP_publisher_bias_to_load.csv`` lists every publisher in
    ``top.publishers`` with its matched bias, left-joined so unmatched
    publishers are kept.
    """
    ratings_sql = """
        SELECT
            id as bias_id
            ,publisher as name
            ,bias as label
        FROM bias_ratings
        ORDER by agree desc
    """
    mapped_sql = """
        SELECT
            p.id as publisher_id
            ,p.name as name
            ,p.tld as tld
            ,b.label as bias
            ,b.bias_id as bias_id
        FROM top.publishers p
        LEFT JOIN publisher_bias b
            ON b.publisher_id = p.id
    """

    with connect() as db:
        ratings_frame = db.query(ratings_sql).df()
        ratings_frame.to_csv(paths('data') / 'TMP_publisher_bias.csv', sep="|", index=False)

    with connect() as db:
        mapped_frame = db.query(mapped_sql).df()
        mapped_frame.to_csv(paths('data') / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
@click.command('bias:import-mapped')
def import_mapped():
    """Rebuild ``top.publisher_bias`` from ``TMP_publisher_bias_to_load.csv``.

    Only rows that carry a ``bias_id`` are loaded; the id is cast to int.
    """
    table_name = "top.publisher_bias"
    csv_path = paths('data') / 'TMP_publisher_bias_to_load.csv'
    # Keep the local name ``df``: the statement below selects from the
    # DataFrame through that variable name.
    df = pd.read_csv(csv_path, sep="|")
    create_sql = f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            publisher_id AS publisher_id
            ,cast(bias_id AS int) as bias_id
        FROM df
        WHERE bias_id IS NOT NULL
    """
    with connect() as db:
        db.query(create_sql)
    print(f"created table: {table_name}")

40
src/data/broken_links.py Normal file
View File

@@ -0,0 +1,40 @@
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import click
from data.main import connect
@click.command(name="broken:crawl")
def crawl():
"""crawl story urls checking for link rot or redirects."""
# Fetch the five oldest stories (by published_at) as (id, url) tuples.
with connect() as db:
urls = db.query("""
select
id
,url
from stories
order by published_at asc
limit 5
""").fetchall()
# NOTE(review): looks like an interactive-debugging leftover. Both names are
# rebound by the loop below, and the index raises IndexError when the query
# returns fewer than two rows.
story_id, url = urls[1]
# url
responses = []
for story_id, url in urls:
# Per-story record; the defaults describe a direct, successful fetch.
out = {'story_id' : story_id, 'final_url' : url, 'timeout' : 0, 'status_code' : 200, 'content_length' : 0}
try:
# verify=False turns off TLS certificate verification for the request.
response = requests.get(url, verify=False, timeout=10)
# NOTE(review): 'redirect' is only set when the history holds more than one
# entry, so a single redirect is not flagged and the key is absent otherwise.
if len(response.history) > 1:
out['redirect'] = 1
if url != response.url:
out['final_url'] = response.url
out['status_code'] = response.status_code
out['content_length'] = len(response.content)
# Only read timeouts are handled; other request errors propagate.
except requests.exceptions.ReadTimeout as e:
print(f"timeout: {url}")
out['timeout'] = 1
responses.append(out)
# NOTE(review): `hist` is not defined anywhere in this function, so this line
# raises NameError, and `responses` is never used. Confirm what should be plotted.
sns.histplot(x=hist['cnt'])
plt.show()

484
src/data/emotion.py Normal file
View File

@@ -0,0 +1,484 @@
import click
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from train.model import BertForMultiLabelClassification
from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
def data():
    """Return the stories that have not been labelled with emotions yet.

    Returns:
        A pandas DataFrame with ``id`` and ``title`` columns, one row per
        story whose id does not appear in ``story_emotions``, ordered by id
        descending.
    """
    # The context manager closes the connection even when the query raises.
    with connect() as db:
        return db.sql("""
            SELECT
                id,
                title
            FROM stories
            WHERE id NOT IN (
                SELECT
                    DISTINCT story_id
                FROM story_emotions
            )
            ORDER BY id DESC
        """).df()
@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels."""
    table = "story_emotions"
    # The statement must be an f-string: otherwise the literal text "{table}"
    # is sent to the database instead of the table name.
    with connect() as db:
        db.execute(f"""
            CREATE OR REPLACE TABLE {table}
            (
                story_id BIGINT,
                label TEXT,
                score REAL
            )
        """)
    print(f"\"{table}\" created")
@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    checkpoint = "monologg/bert-base-cased-goemotions-original"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForMultiLabelClassification.from_pretrained(checkpoint)
    model.to(device)

    table = data()
    for part in tqdm(np.array_split(table.to_numpy(), chunks)):
        # array_split yields empty parts when there are fewer rows than
        # chunks; there is nothing to tokenize for those.
        if len(part) == 0:
            continue
        ids = [row[0] for row in part]
        docs = [row[1] for row in part]
        tokens = tokenizer(
            docs,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=92,
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        # An empty result list would give a column-less frame that cannot be
        # inserted.
        if not results:
            continue
        # Keep the local name ``df``: the INSERT reads the DataFrame through it.
        df = pd.DataFrame(results)
        with connect() as db:
            db.execute('INSERT INTO story_emotions SELECT * FROM df')
def run(model, tokens, ids, threshold=0.1):
    """Score a tokenized batch and keep every label above ``threshold``.

    Args:
        model: callable invoked as ``model(**tokens)``; the first element of
            its result is treated as the logits, and ``model.config.id2label``
            maps a logit column index to a label name.
        tokens: mapping of keyword arguments forwarded to ``model``.
        ids: story ids, positionally aligned with the rows of the logits.
        threshold: minimum sigmoid score (exclusive) for a label to be kept.

    Returns:
        A list of ``{"story_id", "label", "score"}`` dicts in row-major
        order; one story may contribute several labels or none.
    """
    with torch.no_grad():
        logits = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-logits))  # sigmoid: independent per-label scores
    id2label = model.config.id2label
    return [
        {"story_id": ids[row], "label": id2label[col], "score": score}
        for row, row_scores in enumerate(scores)
        for col, score in enumerate(row_scores)
        if score > threshold
    ]
@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables."""
    # Steps, in order:
    #   1. build ``emotions`` with one row per label attached to more than
    #      1000 stories,
    #   2. add ``story_emotions.emotion_id`` and fill it by matching labels
    #      (labels filtered out in step 1 keep a NULL emotion_id),
    #   3. drop the now-redundant ``story_emotions.label`` column.
    # No query may read ``story_emotions.label`` after step 3.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            -- WHERE YEAR(s.published_at) < 2022
            GROUP BY e.label
            HAVING stories > 1000
            ORDER BY stories DESC
        """)
        db.sql("""
            ALTER TABLE story_emotions
            ADD COLUMN emotion_id int64
        """)
        db.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE emotions.label = story_emotions.label
        """)
        db.sql("""
            ALTER TABLE story_emotions
            DROP COLUMN label
        """)
# NOTE(review): `analyze` further down registers the same command name
# ("emotion:analyze"); confirm which of the two should keep it.
@click.command("emotion:analyze")
def coef_over_time():
    """Fit a linear trend to each emotion's share of stories and print the slopes.

    Stories published before 2022 are grouped into 2-year buckets. For every
    label in ``emotions`` except 'neutral', a linear regression is fitted of
    the label's share of stories against the bucket date (in years since the
    epoch). The slopes are printed sorted ascending, with a flag telling
    whether the share is increasing.
    """
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def load_shares(db, buckets='1 month'):
        # Per bucket and label: the label's story count divided by all stories
        # in that bucket.
        return db.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()

    def get_coef(shares, label):
        subset = shares[shares['label'] == label]
        x = subset['date'].to_numpy().reshape(-1, 1)
        y = subset['stories']
        # x and y must be split in ONE call: separate calls shuffle
        # independently and pair dates with the wrong shares.
        x_train, _x_test, y_train, _y_test = train_test_split(x, y)
        reg = linear_model.LinearRegression()
        reg.fit(x_train, y_train)
        return reg.coef_

    with connect() as db:
        emotions = db.sql("""
            select label from emotions
        """).df()
        shares = load_shares(db, '2 year')

    collection = []
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(shares, emotion)[0]
        collection.append({'emotion': emotion, 'coef': coef, 'increasing': coef > 0})
    print(pd.DataFrame(collection).sort_values('coef'))
# NOTE(review): unfinished exploratory code; several statements below cannot
# run as written (see the individual notes).
@click.command("emotion:analyze")
def analyze():
"""plot and group emotional labels"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# NOTE(review): get_coef is defined but never called in this function. Its
# query is a plain string, so the `emotion` argument is ignored and the label
# is fixed to 'sadness'. The `total` CTE is unused: the final select divides
# by emotions.stories instead.
def get_coef(emotion):
df = DB.sql("""
with cte as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
--AND e.label in ('neutral', 'annoyance')
AND e.label in ('sadness')
GROUP BY
time_bucket(interval '1 month', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '1 month', s.published_at)
)
select
epoch(cte.date) as date
,cte.label
--,total.stories as total
,cast(cte.stories as float) / e.stories as stories
from cte
join emotions e
--on total.date = cte.date
on e.label = cte.label
""").df()
reg = linear_model.LinearRegression()
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
# NOTE(review): x and y are split by two separate calls, so the training rows
# of x presumably no longer line up with those of y. Confirm and split both in
# one call.
x_train, x_test = train_test_split(x)
y_train, y_test = train_test_split(y)
reg.fit(x_train, y_train)
#y_pred = reg.predict(x_test)
return reg.coef_
# NOTE(review): `yearly` is not defined anywhere in this function.
df = DB.sql(f"""{yearly}""").df()
df['date'] = pd.to_datetime(df['date'])
ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
plt.locator_params(axis='y', nbins=6)
ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
plt.show()
# NOTE(review): the `grouped` CTE below has an empty body, and the result of
# this call is not assigned to anything.
DB.sql("""
WITH grouped as (
), total AS (
SELECT
e.label
,count(1) as total
FROM grouped s
JOIN story_emotions e
ON e.label = s.label
GROUP BY
e.label
)
SELECT
g.year
,g.label
,100 * (g.stories / CAST(t.total AS float)) AS frac
FROM grouped g
JOIN total t
ON t.label = g.label
ORDER BY g.label, g.year
""")
DB.close()
# NOTE(review): `df` here is still the frame loaded from `yearly` above; the
# 'year' and 'frac' columns are produced only by the unassigned query.
sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
plt.show()
def debug():
    """Label every story title with a text-classification pipeline and rebuild the emotion tables.

    Titles are classified in 5000 chunks with the
    ``j-hartmann/emotion-english-distilroberta-base`` pipeline. The labels
    replace ``story_emotions``, ``emotions`` is rebuilt with per-label counts,
    ``story_emotions.label`` is swapped for an ``emotion_id``, and the raw
    labels are written to ``emotions.csv`` (pipe-delimited) in the data
    directory.
    """
    from transformers import pipeline

    # load data
    conn = connect()
    table = conn.sql("""
        SELECT
            id,
            title
        FROM stories
        ORDER BY id DESC
    """).df()
    conn.close()

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

    chunks = 5000
    labels, ids = [], []
    for chunk in tqdm(np.array_split(table, chunks)):
        sentences = chunk['title'].tolist()
        label_ids = chunk['id'].tolist()
        with torch.no_grad():
            emotions = classifier(sentences)
        labels.append(emotions)
        ids.append(label_ids)

    # Must end up in a local called ``out``: the first statement below selects
    # from the DataFrame through that variable name.
    out = pd.DataFrame(np.concatenate(labels).tolist())
    out_ids = pd.DataFrame(np.concatenate(ids).tolist(), columns=['story_id'])
    out = pd.concat([out_ids, out], axis=1)

    statements = (
        """
        CREATE OR REPLACE TABLE story_emotions AS
        SELECT
            story_id
            ,label
            ,score
        FROM out
        """,
        """
        CREATE OR REPLACE TABLE emotions AS
        SELECT
            row_number() over() as id
            ,label
            ,count(1) as stories
        FROM story_emotions
        GROUP BY
            label
        """,
        """
        ALTER TABLE story_emotions add emotion_id bigint
        """,
        """
        UPDATE story_emotions
        SET emotion_id = emotions.id
        FROM emotions
        WHERE story_emotions.label = emotions.label
        """,
        """
        ALTER TABLE story_emotions drop column label
        """,
        """
        select
            *
        from emotions
        """,
        """
        select
            * from story_emotions
        limit 4
        """,
    )
    conn = connect()
    for statement in statements:
        conn.sql(statement)
    conn.close()

    out.to_csv(data_dir() / 'emotions.csv', sep="|")
# NOTE(review): exploratory scratch code; several statements below cannot run
# as written (see the individual notes).
def another():
DB = connect()
# The results of the next two queries are not assigned or used.
DB.sql("""
select
*
from emotions
""")
DB.sql("""
select
*
from story_emotions
""")
# Story counts per publication year and emotion label.
emotions = DB.sql("""
SELECT
YEAR(s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
YEAR(s.published_at)
,e.label
""").df()
# Bare expression: has no effect outside an interactive session.
emotions
sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
plt.show()
# One row per year, one column per emotion.
pivot = emotions.pivot(index='year', columns='emotion', values='stories')
pivot.reset_index(inplace=True)
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
# Fit stories ~ year for each emotion column (the first key is 'year') and
# print the slope.
for emotion in pivot.keys()[1:].tolist():
_ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
print(f"{emotion}: {reg.coef_[0]}")
fig, ax = plt.subplots()
#sns.lineplot(x=pivot['anger'], y=pivot['joy'])
#sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
# NOTE(review): pivot[''] looks up a column with an empty name; presumably an
# unfinished edit. Confirm which emotion was intended.
sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
plt.show()
# NOTE(review): the connection is closed here but DB.sql is called again three
# more times below.
DB.close()
# NOTE(review): this query reads se.label and b.id / b.label, whereas the
# query above joins emotions through se.emotion_id. Verify the column names
# against the current schema.
normalized = DB.sql("""
with cte as (
select
year(s.published_at) as year
,se.label as emotion
,b.label as bias
from stories s
join story_emotions se
on s.id = se.story_id
join publisher_bias b
on b.id = s.publisher_id
where b.label != 'allsides'
and se.label != 'neutral'
)
select
distinct
year
,emotion
,bias
,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
from cte
""").df()
# The resulting frame is not assigned or used.
DB.sql("""
select
b.label as bias
,count(1) as stories
from stories s
join story_emotions se
on s.id = se.story_id
join publisher_bias b
on b.id = s.publisher_id
group by
b.label
""").df()
# NOTE(review): `emotional_bias` is not defined anywhere in this function.
another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
another_pivot.reset_index(inplace=True)
sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
plt.show()
sns.relplot(
data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
#data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
)
plt.show()
DB.sql("""
select
*
from another_pivot
""")

View File

@@ -8,7 +8,7 @@ from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from data.main import connect, map_tld, paths, reporting_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm
@@ -155,7 +155,7 @@ def create_tables():
FROM stories s
""").df()
stories['tld'] = stories.url.apply(map_tld)
raw_stories['tld'] = raw_stories.url.apply(map_tld)
with connect() as db:
db.sql("""
@@ -167,5 +167,25 @@ def create_tables():
JOIN mbfc.publishers p
ON p.tld = s.tld
""")
with connect() as db:
data = db.sql("""
select
id,
reporting
from mbfc.publishers p
""").df()
with connect() as db:
db.sql("""
alter table mbfc.publishers add column reporting_ordinal int
""")
data['ordinal'] = data.reporting.apply(reporting_label_to_int)
with connect() as db:
db.sql("""
update mbfc.publishers
set reporting_ordinal = data.ordinal
from data
where data.id = publishers.id
""")

View File

@@ -22,6 +22,8 @@ def paths(name='app'):
return Path(os.environ['DATA_MINING_DOCS_DIR'])
if 'figure' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
if 'model' in name:
return Path(os.environ['DATA_MINING_DATA_DIR']) / 'models'
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -105,3 +107,32 @@ def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
except:
print(f"no mapping for {class_id}", file=sys.stderr)
return -1
def reporting_label_to_int(label):
    """Map a factual-reporting label to its integer code.

    Args:
        label: the reporting label, e.g. ``'High'``.

    Returns:
        The integer code for a known label, or ``-1`` when the label is not in
        the table (including ``None``, NaN and unhashable values).
    """
    # NOTE(review): 'Mixed' maps to -1, the same value returned for unknown
    # labels, while 2 is unused. Confirm this is deliberate.
    mapping = {
        'Very Low': 0,
        'Low': 1,
        'Mixed': -1,
        'Mostly Factual': 3,
        'High': 4,
        'Very High': 5,
    }
    try:
        return mapping[label]
    except (KeyError, TypeError):
        # KeyError: unknown label. TypeError: unhashable value such as a list.
        return -1
def save_model(model, name):
    """Pickle ``model`` to ``paths('models') / name`` and print the destination.

    Args:
        model: any picklable object.
        name: file name inside the models directory.
    """
    import pickle

    save_to = paths('models') / name
    with open(save_to, 'wb') as handle:
        pickle.dump(model, handle)
    print(f"saved model: {save_to}")
def load_model(name):
    """Unpickle and return the object stored at ``paths('models') / name``.

    Args:
        name: file name inside the models directory.

    Returns:
        The unpickled object.
    """
    import pickle

    open_from = paths('models') / name
    print(f"loading model: {open_from}")
    # Unpickling runs code embedded in the file; only load files this project
    # wrote itself.
    with open(open_from, 'rb') as handle:
        return pickle.load(handle)

287
src/data/sentence.py Normal file
View File

@@ -0,0 +1,287 @@
import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, weighting each token by its attention mask.

    Args:
        model_output: sequence whose first element holds the token embeddings.
        attention_mask: mask with one entry per token; entries equal to 0
            remove the token from the average.

    Returns:
        The masked mean of the token embeddings taken over dimension 1.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight
    # every component of each token vector.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp so a fully masked sequence does not divide by zero.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentence:embed")
def embed(chunks):
    """Embed every story title and save the embeddings and their story ids.

    Titles are ASCII-folded, split into ``chunks`` parts, encoded with
    ``sentence-transformers/all-MiniLM-L6-v2``, mean-pooled and L2-normalized.
    The results are written to ``embeddings.npy`` and ``embedding_ids.npy`` in
    the data directory.
    """
    # Load model from HuggingFace Hub
    checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # Fold titles to plain ASCII (NFKD-decompose, then drop non-ASCII bytes).
    table['title'] = (
        table['title']
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )

    embeddings = []
    embedding_ids = []
    for chunk in tqdm(np.array_split(table, chunks), 'embedding'):
        sentences = chunk['title'].tolist()
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])
        embeddings.append(F.normalize(pooled, p=2, dim=1))
        embedding_ids.append(chunk['id'].tolist())

    outputs = (
        ('embeddings', 'embeddings.npy', np.concatenate(embeddings)),
        ('ids', 'embedding_ids.npy', np.concatenate(embedding_ids)),
    )
    for label, filename, array in outputs:
        save_to = paths('data') / filename
        np.save(save_to, array)
        print(f"{label} saved: {save_to}")
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    """Project each publisher's mean title embedding onto two PCA components.

    Loads the saved embeddings and ids, averages the embeddings of every
    publisher that has a bias rating, fits a 2-component PCA on those averages
    and rebuilds ``top.publisher_embeddings_pca`` with
    ``(publisher_id, first, second)``.
    """
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # ``index`` is the row position inside ``embeddings``. The name ``ids`` is
    # read by the query below and must not change.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    rows = []
    averages = []
    # Group by the column NAME, not a one-element list: with a list, recent
    # pandas versions yield 1-tuples as group keys, which would end up in the
    # publisher_id column.
    for publisher_id, group in data.groupby('publisher_id'):
        averages.append(embeddings[group['index'].to_numpy()].mean(axis=0))
        rows.append({'publisher_id': publisher_id, 'ordinal': group['ordinal'].iloc[0]})

    pred = PCA(n_components=2).fit_transform(np.stack(averages))

    # The name ``results`` is read by the statement below.
    results = pd.DataFrame(rows)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                results.publisher_id as publisher_id
                ,results.first as first
                ,results.second as second
            FROM results
        """)
    print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
    """Project every rated story's title embedding onto two PCA components.

    Loads the saved embeddings and ids, keeps the stories whose publisher has
    an ordinal other than -1, fits a 2-component PCA on their embeddings and
    rebuilds ``story_embeddings_pca`` with ``(story_id, first, second)``.
    """
    from sklearn.decomposition import PCA

    # ``paths`` is the helper imported from data.main; no ``path`` exists in
    # this module.
    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # ``index`` is the row position inside ``embeddings``. The name ``ids`` is
    # read by the query below and must not change.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    # The name ``data`` is read by the CREATE statement further down.
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index'].to_numpy()]
    pred = PCA(n_components=2).fit_transform(x)
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]

    table_name = "story_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                data.id as story_id
                ,data.first as first
                ,data.second as second
            FROM data
        """)
    print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
    """Fit an SGD classifier from title embeddings to publisher ordinals and pickle it.

    Only stories whose publisher ordinal is not -1 are used. The fitted model
    is stored as ``sgdclassifier.pkl`` through ``save_model``.
    """
    from sklearn import svm
    from sklearn.linear_model import SGDClassifier

    data_dir = paths('data')
    embeddings = np.load(data_dir / 'embeddings.npy')
    embedding_ids = np.load(data_dir / 'embedding_ids.npy')
    # ``index`` is the row position inside ``embeddings``. The name ``ids`` is
    # read by the query below and must not change.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    labelled_sql = """
        SELECT
            ids.index
            ,s.id
            ,p.ordinal
            ,p.bias
        FROM ids
        JOIN stories s
            ON ids.story_id = s.id
        JOIN mbfc.publisher_stories ps
            ON s.id = ps.story_id
        JOIN mbfc.publishers p
            ON p.id = ps.publisher_id
        WHERE p.ordinal != -1
    """
    with connect() as db:
        data = db.query(labelled_sql).df()

    features = embeddings[data['index']]
    targets = data['ordinal']
    classifier = SGDClassifier().fit(features, targets)
    # data['pred'] = pred.predict(x)
    save_model(classifier, 'sgdclassifier.pkl')
# NOTE(review): exploratory scratch code, not registered as a click command.
# The name looks like a typo of "inference"; it is left as is because callers
# may reference it.
def interence():
# Distinct (bias, ordinal) pairs for publishers with a usable ordinal; the
# resulting `bias` frame is not used afterwards in this function.
with connect() as db:
bias = db.query("""
SELECT
p.bias
,p.ordinal
FROM mbfc.publishers p
WHERE p.ordinal != -1
GROUP BY
p.bias
,p.ordinal
ORDER BY
p.ordinal
""").df()
sdg = load_model( 'sgdclassifier.pkl')
# NOTE(review): `tokenizer` and `model` are not defined in this function and
# no module-level definitions are visible in this file. Presumably the ones
# created inside `embed` were intended.
tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
output = model(**tokens)
output = mean_pooling(output, tokens['attention_mask'])
output = F.normalize(output, p=2, dim=1)
# The prediction is computed but neither returned nor printed.
sdg.predict(output)
# Bare expressions: no effect outside an interactive session.
tokens
dir(output)
def validation():
    """Train a LinearSVC on title embeddings and plot its test-set confusion matrix.

    Stories whose publisher ordinal is -1 are excluded. The data is split
    67/33 with a fixed random state, the classifier is fitted on the training
    part, and the confusion matrix of the held-out part is saved as a figure
    and then displayed.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # ``index`` is the row position inside ``embeddings``. The name ``ids`` is
    # read by the query below and must not change.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    ax.set(
        title="confusion matrix for LinearSVC classifier on test data.",
        xticklabels=ticklabels(),
        yticklabels=ticklabels(),
    )

    # NOTE(review): the file name is a placeholder choice; confirm the name
    # the paper expects.
    save_to = paths('figures') / 'sentence_confusion_matrix.png'
    # Save BEFORE show(): once the window is closed the figure may be
    # released, and a later savefig would write an empty image.
    plt.savefig(save_to)
    plt.show()

View File

@@ -20,15 +20,14 @@ def extract(chunks):
# load data
DB = connect()
table = DB.sql("""
select
id
,title
from stories
order by id desc
""").df()
DB.close()
with connect() as db:
table = db.sql("""
select
id
,title
from stories
order by id desc
""").df()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -56,12 +55,12 @@ def extract(chunks):
story_ids = np.concatenate(story_ids)
# save embeddings
save_to = data_dir() / 'sentiment.npy'
save_to = paths('data') / 'sentiment.npy'
np.save(save_to, sentiments)
print(f"sentiments saved: {save_to}")
# save ids
save_to = data_dir() / 'sentiment_ids.npy'
save_to = paths('data') / 'sentiment_ids.npy'
np.save(save_to, story_ids)
print(f"ids saved: {save_to}")

93
src/data/word.py Normal file
View File

@@ -0,0 +1,93 @@
import click
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from data.main import connect, paths
from tqdm import tqdm
import torch
from pathlib import Path
@click.command(name="word:max-sequence")
def max_sequence():
    """Print the largest roberta-base token count among the 5000 longest story titles."""
    longest_sql = """
        select
            title
        from stories
        order by length(title) desc
        limit 5000
    """
    with connect() as db:
        longest = db.sql(longest_sql).df()

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    tokens = tokenizer(longest['title'].to_list())
    longest_len = max(len(input_ids) for input_ids in tokens['input_ids'])
    print(f"{longest_len}")
@click.command(name="word:embed")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(paths('data') / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(paths('data') / 'tokens'), show_default=True)
@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
def embed(chunks, embedding_dir, token_dir, device):
    """ given titles, generate tokens and word embeddings and saves to disk """
    # init models
    device = torch.device(device)
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")
    model.to(device)

    # np.save does not create missing directories, so make sure both output
    # locations exist before any work is done.
    embedding_dir.mkdir(parents=True, exist_ok=True)
    token_dir.mkdir(parents=True, exist_ok=True)

    # load data
    with connect() as db:
        table = db.sql("""
            select
                title
            from stories
            order by id desc
        """).df()

    # normalize text: NFKD-decompose, then drop anything outside ASCII
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

    # generate embeddings from list of titles
    parts = np.array_split(table['title'].to_numpy(), chunks)
    for i, chunk in enumerate(tqdm(parts, 'embedding')):
        # array_split pads with empty parts when there are fewer titles than
        # chunks; skip them. File numbering still follows the part index.
        if len(chunk) == 0:
            continue

        # create tokens, padding to max width
        tokens = tokenizer(
            chunk.tolist(),
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=92,
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokens = tokens.to(device)

        with torch.no_grad():
            outputs = model(**tokens)

        # to disk
        hidden = outputs.last_hidden_state.to(torch.device('cpu')).detach().numpy()
        np.save(embedding_dir / f"embedding_{i}.npy", hidden)

        tokens = tokens.to(torch.device('cpu'))
        # NOTE(review): this saves the tokenizer's whole return object, not a
        # plain array. Confirm that downstream readers expect that format.
        np.save(token_dir / f"token_{i}.npy", tokens)
# NOTE(review): unfinished stub (see the TODO in its docstring); it cannot run
# as written.
@click.command(name="word:distance")
def distance():
"""TODO: measure distance between sequence embeddings"""
# NOTE(review): no `distance` module is imported in this file, so the name
# here refers to this command itself. Presumably scipy.spatial.distance was
# intended. `classes` is not defined anywhere in this function.
distances = distance.cdist(classes, classes, 'euclidean')
# Exclude self-distances so the minimum below is between two different rows.
np.fill_diagonal(distances, np.inf)
min_index = (np.argmin(distances))
# Convert the flat argmin back to a (row, column) pair; `closest` is not used
# afterwards.
closest = np.unravel_index(min_index, distances.shape)
# Bare expression: no effect outside an interactive session.
distances.flatten().shape
# path = paths('data') / 'embeddings'
# chunks = [x for x in path.iterdir() if x.match('*.npy')]
# chunks = sorted(chunks, key=lambda x: int(x.stem.split('_')[1]))
#
# data = None
# for i, f in enumerate(tqdm(chunks)):
# loaded = np.load(f)
# if data is None:
# data = loaded
# else:
# data = np.concatenate([data, loaded])
# if i > 20:
# break
#
# data.shape
#
# np.save(data, paths('data') / 'embeddings.npy')