import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm

# Mean pooling: take the attention mask into account for correct averaging.
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
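
# Shape sketch for mean_pooling (hypothetical sizes): for a batch of 2 titles
# padded to 8 tokens with hidden size 384, model_output[0] is (2, 8, 384) and
# attention_mask is (2, 8); the result is (2, 384), the average of each
# title's real (non-padding) token embeddings.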

@click.command("sentence:embed")
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
def embed(chunks):
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # load data
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    chunked = np.array_split(table, chunks)

    # generate embeddings from list of titles
    iterator = tqdm(chunked, 'embedding')
    embeddings = []
    embedding_ids = []
    for chunk in iterator:
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()

        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling
        output = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        output = F.normalize(output, p=2, dim=1)

        embeddings.append(output)
        embedding_ids.append(ids)

    embeddings = np.concatenate(embeddings)
    ids = np.concatenate(embedding_ids)
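
    # note: np.concatenate accepts the CPU torch tensors directly (they
    # convert via the array protocol), yielding one (n_stories, 384) array;
    # chunking also bounds padding waste, since the tokenizer only pads each
    # batch to its own longest title.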
    # save embeddings
    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")

    # save ids
    save_to = paths('data') / 'embedding_ids.npy'
    np.save(save_to, ids)
    print(f"ids saved: {save_to}")

@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()
    # average each publisher's story embeddings into a single vector
    results = []
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id': publisher_id, 'embedding': avg, 'ordinal': ordinal})
    results = pd.DataFrame(results)
    x = np.stack(results['embedding'])
    y = results['ordinal']

    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]
table_name = "top.publisher_embeddings_pca"
|
|
with connect() as db:
|
|
db.query(f"""
|
|
CREATE OR REPLACE TABLE {table_name} AS
|
|
SELECT
|
|
results.publisher_id as publisher_id
|
|
,results.first as first
|
|
,results.second as second
|
|
FROM results
|
|
""")
|
|
|
|
print(f"created {table_name}")

@click.command('sentence:create-pca-table')
def create_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()
        pub = db.query("""
            SELECT
                *
            FROM mbfc.publishers
        """).df()
    x = embeddings[data['index']]
    y = data['ordinal'].to_numpy().reshape(-1, 1)

    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]
table_name = f"story_embeddings_pca"
|
|
with connect() as db:
|
|
db.query(f"""
|
|
CREATE OR REPLACE TABLE {table_name} AS
|
|
SELECT
|
|
data.id as story_id
|
|
,data.first as first
|
|
,data.second as second
|
|
FROM data
|
|
""")
|
|
print(f"created {table_name}")

@click.command('sentence:create-svm-table')
def create_svm_table():
    """train a linear classifier mapping sentence embeddings to bias ordinals"""

    from sklearn.linear_model import SGDClassifier

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()
    x = embeddings[data['index']]
    y = data['ordinal']

    model = SGDClassifier()
    model = model.fit(x, y)
    # data['pred'] = model.predict(x)
    save_model(model, 'sgdclassifier.pkl')
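
    # with its default hinge loss, SGDClassifier fits a linear SVM via
    # stochastic gradient descent, so the saved model is close kin to the
    # LinearSVC used in validation() below.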

def inference():
    """scratch helper: run the saved classifier on a couple of example titles"""

    with connect() as db:
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
                ,p.ordinal
            ORDER BY
                p.ordinal
        """).df()

    sdg = load_model('sgdclassifier.pkl')

    # reload the embedding model used by sentence:embed so the helper is self-contained
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        output = model(**tokens)

    output = mean_pooling(output, tokens['attention_mask'])
    output = F.normalize(output, p=2, dim=1)

    # predicted ordinals can be mapped back to labels via the bias frame
    print(sdg.predict(output))

def validation():
    """hold out a test set and plot a confusion matrix for a linear SVM"""

    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()
    x = embeddings[data['index']]
    y = data['ordinal']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    ax.set(title="confusion matrix for linear SVM classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())

    save_to = paths('data') / 'confusion_matrix.png'  # hypothetical filename; save_to was previously undefined
    plt.savefig(save_to)
    plt.show()
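
# Note: inference() and validation() are scratch helpers rather than click
# commands; they are meant to be run from a REPL after sentence:embed and
# sentence:create-svm-table have produced embeddings.npy, embedding_ids.npy,
# and sgdclassifier.pkl.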