import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm


def mean_pooling(model_output, attention_mask):
    """Mean pooling that takes the attention mask into account, so padding
    tokens do not contribute to the sentence embedding."""
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


@click.command("sentence:embed")
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
def embed(chunks):
    # load model and tokenizer from the HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # load story titles
    with connect() as db:
        table = db.sql("""
            SELECT
                id
                ,title
            FROM stories
            ORDER BY id DESC
        """).df()

    # normalize titles to plain ASCII
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    chunked = np.array_split(table, chunks)

    # generate embeddings chunk by chunk to bound memory usage
    iterator = tqdm(chunked, 'embedding')
    embeddings = []
    embedding_ids = []
    for chunk in iterator:
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()

        # tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

        # compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # pool, then L2-normalize so cosine similarity reduces to a dot product
        output = mean_pooling(model_output, encoded_input['attention_mask'])
        output = F.normalize(output, p=2, dim=1)

        embeddings.append(output)
        embedding_ids.append(ids)

    embeddings = np.concatenate(embeddings)
    ids = np.concatenate(embedding_ids)

    # save embeddings
    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")

    # save the story ids, row-aligned with the embeddings
    save_to = paths('data') / 'embedding_ids.npy'
    np.save(save_to, ids)
    print(f"ids saved: {save_to}")


@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # duckdb can read the local `ids` DataFrame by name inside the query below
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s ON ids.story_id = s.id
            JOIN top.publisher_bias pb ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b ON b.id = pb.bias_id
        """).df()

    # average each publisher's story embeddings into a single vector
    results = []
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id': publisher_id, 'embedding': avg, 'ordinal': ordinal})
    results = pd.DataFrame(results)

    # project the publisher vectors onto their first two principal components
    x = np.stack(results['embedding'])
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                results.publisher_id AS publisher_id
                ,results.first AS first
                ,results.second AS second
            FROM results
        """)
    print(f"created {table_name}")
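
# The arrays written by `embed` are row-aligned and L2-normalized, so cosine
# similarity between two titles is a plain dot product. A minimal sketch of a
# nearest-neighbour lookup over those artifacts (`_most_similar` is a
# hypothetical helper, not part of the CLI):
def _most_similar(row: int, k: int = 5) -> np.ndarray:
    embeddings = np.load(paths('data') / 'embeddings.npy')
    ids = np.load(paths('data') / 'embedding_ids.npy')
    scores = embeddings @ embeddings[row]  # cosine similarity of `row` against all rows
    top = np.argsort(-scores)[1:k + 1]     # best k matches, skipping the row itself
    return ids[top]                        # story ids of the nearest neighbours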

@click.command('sentence:create-pca-table')
def create_pca_table():
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN stories s ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps ON s.id = ps.story_id
            JOIN mbfc.publishers p ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    # project each story embedding onto its first two principal components
    x = embeddings[data['index']]
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]

    table_name = "story_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                data.id AS story_id
                ,data.first AS first
                ,data.second AS second
            FROM data
        """)
    print(f"created {table_name}")


@click.command('sentence:create-svm-table')
def create_svm_table():
    """Fit a linear classifier from title embeddings to publisher bias ordinals."""
    from sklearn.linear_model import SGDClassifier

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps ON s.id = ps.story_id
            JOIN mbfc.publishers p ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']

    # SGDClassifier with its default hinge loss fits a linear SVM
    model = SGDClassifier()
    model = model.fit(x, y)
    save_model(model, 'sgdclassifier.pkl')


def inference():
    """Scratch helper: embed two example titles and classify them with the saved model."""
    # lookup table mapping bias ordinals back to their labels
    with connect() as db:
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY p.bias, p.ordinal
            ORDER BY p.ordinal
        """).df()

    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    sgd = load_model('sgdclassifier.pkl')

    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"],
                       padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    output = mean_pooling(output, tokens['attention_mask'])
    output = F.normalize(output, p=2, dim=1)

    # map predicted ordinals back to bias labels
    pred = sgd.predict(output.numpy())
    print(bias.set_index('ordinal').loc[pred, 'bias'])


def validation():
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps ON s.id = ps.story_id
            JOIN mbfc.publishers p ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    ax.set(title="confusion matrix for LinearSVC classifier on test data",
           xticklabels=ticklabels(),
           yticklabels=ticklabels())

    save_to = paths('data') / 'confusion_matrix.png'  # assumed filename; the original never defined save_to
    plt.savefig(save_to)
    plt.show()
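
# The "sentence:*" command names suggest these commands are registered with a
# larger CLI elsewhere in the project. A minimal sketch of that wiring,
# assuming a plain click.Group (the real entry point may differ):
if __name__ == '__main__':
    cli = click.Group()
    for command in (embed, create_avg_pca_table, create_pca_table, create_svm_table):
        cli.add_command(command)
    cli()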