# wwu-577/src/data/sentence.py
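"""Sentence embeddings for story titles.

Click commands that embed story titles with a MiniLM sentence-transformer,
project the embeddings with PCA, and fit linear classifiers against
publisher bias ratings."""
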
import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm

# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

@click.command("sentence:embed")
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
def embed(chunks):
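    """Embed story titles with sentence-transformers/all-MiniLM-L6-v2 and save
    the embeddings and their story ids as .npy files under paths('data')."""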
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # load story titles from the database
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # normalize text: strip accents and non-ascii characters
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    chunked = np.array_split(table, chunks)

    # generate embeddings from each chunk of titles
    iterator = tqdm(chunked, 'embedding')
    embeddings = []
    embedding_ids = []
    for chunk in iterator:
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform pooling
        output = mean_pooling(model_output, encoded_input['attention_mask'])
        # Normalize embeddings to unit length
        output = F.normalize(output, p=2, dim=1)
        embeddings.append(output)
        embedding_ids.append(ids)
    embeddings = np.concatenate(embeddings)
    ids = np.concatenate(embedding_ids)

    # save embeddings
    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")

    # save ids
    save_to = paths('data') / 'embedding_ids.npy'
    np.save(save_to, ids)
    print(f"ids saved: {save_to}")

@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
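    """Average title embeddings per publisher, project the averages to 2D with
    PCA, and write the coordinates to top.publisher_embeddings_pca."""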
    from sklearn.decomposition import PCA
    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    # average the embeddings of all stories belonging to each publisher
    results = []
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id': publisher_id, 'embedding': avg, 'ordinal': ordinal})
    results = pd.DataFrame(results)

    x = np.stack(results['embedding'])
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                results.publisher_id AS publisher_id
                ,results.first AS first
                ,results.second AS second
            FROM results
        """)
    print(f"created {table_name}")

@click.command('sentence:create-pca-table')
def create_pca_table():
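    """Project each story's title embedding to 2D with PCA and write the
    coordinates to story_embeddings_pca."""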
    from sklearn.decomposition import PCA
    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]

    table_name = "story_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                data.id AS story_id
                ,data.first AS first
                ,data.second AS second
            FROM data
        """)
    print(f"created {table_name}")

@click.command('sentence:create-svm-table')
def create_svm_table():
"""sentence to classifier"""
    from sklearn.linear_model import SGDClassifier
    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']
    model = SGDClassifier()
    model = model.fit(x, y)
    save_model(model, 'sgdclassifier.pkl')

def inference():
    """Scratch helper: classify a couple of example sentences with the saved
    SGDClassifier."""
    with connect() as db:
        # lookup from ordinal to bias label, useful for interpreting predictions
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
                ,p.ordinal
            ORDER BY
                p.ordinal
        """).df()

    # the tokenizer and model are local to embed(), so load them here
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    sdg = load_model('sgdclassifier.pkl')

    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    output = mean_pooling(output, tokens['attention_mask'])
    output = F.normalize(output, p=2, dim=1)
    print(sdg.predict(output))

def validation():
    """Scratch helper: hold out a test split, fit a LinearSVC on the title
    embeddings, and plot a confusion matrix of its test predictions."""
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    ax.set(title="confusion matrix for LinearSVC on test data", xticklabels=ticklabels(), yticklabels=ticklabels())
    save_to = paths('data') / 'confusion_matrix.png'
    plt.savefig(save_to)  # write the figure out before plt.show() closes it
    plt.show()
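
# A minimal entry-point sketch. Assumption: in this project the commands are
# probably registered on a shared click group elsewhere (e.g. in data.main);
# this standalone group is only for running the module directly.
@click.group()
def cli():
    pass

cli.add_command(embed)
cli.add_command(create_avg_pca_table)
cli.add_command(create_pca_table)
cli.add_command(create_svm_table)

if __name__ == '__main__':
    cli()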