init
This commit is contained in:
commit
b9c63414a0
|
@ -0,0 +1,2 @@
|
|||
*.csv
|
||||
*.swp
|
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
|
@ -0,0 +1,65 @@
|
|||
import polars as pl
|
||||
import duckdb
|
||||
import toml
|
||||
import os
|
||||
from pathlib import Path
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from enum import Enum, auto
|
||||
import click
|
||||
|
||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||
APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
|
||||
|
||||
db = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
...
|
||||
|
||||
|
||||
class PlotName(str, Enum):
|
||||
TitleLength = "title_len"
|
||||
OutletStories = "outlet_stories"
|
||||
|
||||
@cli.command()
|
||||
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
|
||||
@click.option('-o', '--output', required=False, type=click.Path())
|
||||
def plot(name: PlotName, output: Path):
|
||||
output = output if output else APP_DIR / f'docs/{name}.png'
|
||||
if name == PlotName.TitleLength:
|
||||
fig, ax = plt.subplots(1,1)
|
||||
data = db.sql("""
|
||||
select
|
||||
length(title) as len
|
||||
from stories
|
||||
""").df()
|
||||
sns.histplot(x=data['len'], bins=50, ax=ax[0])
|
||||
ax[0].set(ylabel="count", xlabel="title length")
|
||||
elif name == PlotName.OutletStories:
|
||||
|
||||
data = db.sql("""
|
||||
with cte as (
|
||||
select
|
||||
count(1) as stories
|
||||
from stories
|
||||
group by outlet
|
||||
)
|
||||
select
|
||||
row_number() over(order by stories desc) as id
|
||||
,log(stories) as log_count
|
||||
from cte
|
||||
""").df()
|
||||
|
||||
fig, ax = plt.subplots(1,1)
|
||||
sns.lineplot(x=data['id'], y=data['log_count'], ax=ax)
|
||||
from matplotlib.ticker import ScalarFormatter
|
||||
ax.set(yscale='log', xlabel="outlet", ylabel="log(count of stories)", majorformater=ScalarFormatter)
|
||||
plt.show()
|
||||
|
||||
else:
|
||||
raise NotImplementedError("option unrecognized")
|
||||
plt.savefig(output)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
|
@ -0,0 +1,21 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import duckdb
|
||||
from enum import Enum
|
||||
|
||||
class Data(str, Enum):
|
||||
Titles = 'titles'
|
||||
|
||||
def from_db(t: Data):
|
||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||
# APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
|
||||
DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
|
||||
if t == Data.Titles:
|
||||
table = DB.sql("""
|
||||
select
|
||||
distinct
|
||||
title
|
||||
from stories
|
||||
limit 100
|
||||
""").df()
|
||||
return table
|
|
@ -0,0 +1,8 @@
|
|||
import sklearn
|
||||
import polars as pl
|
||||
import toml
|
||||
from pathlib import Path
|
||||
|
||||
config = toml.load('/home/user/577/repo/config.toml')
|
||||
app_dir = Path(config.get('app').get('path'))
|
||||
df = pl.read_csv(app_dir / "data/articles.csv")
|
|
@ -0,0 +1,16 @@
|
|||
from transformers import AutoTokenizer, RobertaModel
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
class Model(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.n_classes = 10
|
||||
self.bert = RobertaModel.from_pretrained("roberta-base")
|
||||
self.linear = torch.nn.Linear(self.bert.config.hidden_size, self.n_classes)
|
||||
self.act = torch.nn.Sigmoid()
|
||||
|
||||
def forward(self, x):
|
||||
outs = self.bert(**x)
|
||||
outs = self.act(self.linear(outs.last_hidden_state))
|
||||
return outs
|
|
@ -0,0 +1,5 @@
|
|||
import pandas as pd
|
||||
import math
|
||||
|
||||
df = pd.read_csv('/tmp/attr.csv')
|
||||
((((df.left - 9.1) ** 2) + ((df.right - 11.0) ** 2)) ** 0.5).sort_values()
|
|
@ -0,0 +1,43 @@
|
|||
import click
|
||||
from scipy.spatial import distance
|
||||
from transformers import AutoTokenizer, RobertaModel
|
||||
import numpy as np
|
||||
from model import Model
|
||||
from data import Data, from_db
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
...
|
||||
|
||||
@cli.command()
|
||||
def train():
|
||||
table = from_db(Data.Titles)
|
||||
n_classes = 10
|
||||
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
|
||||
|
||||
# create tokens, padding to max width
|
||||
tokens = tokenizer(table['title'].apply(str).to_list(), add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
|
||||
pred_y = outputs[:, 0, :]
|
||||
|
||||
model = RobertaModel.from_pretrained("roberta-base")
|
||||
pred_y = model(**inputs)
|
||||
outputs = model(**tokens)
|
||||
|
||||
# linear = torch.nn.Linear(model.config.hidden_size, n_classes)
|
||||
# act = torch.nn.Sigmoid()
|
||||
|
||||
# model = Model()
|
||||
pred_y.last_hidden_state[:, 0, :].shape
|
||||
classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
|
||||
|
||||
@cli.command()
|
||||
def distance():
|
||||
distances = distance.cdist(classes, classes, 'euclidean')
|
||||
np.fill_diagonal(distances, np.inf)
|
||||
min_index = (np.argmin(distances))
|
||||
closest = np.unravel_index(min_index, distances.shape)
|
||||
distances.flatten().shape
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
Loading…
Reference in New Issue