commit 5dab7652161aa6249bf8db4d85993f24463013e4
Author: Matt J
Date:   Sat Oct 29 17:19:59 2022 -0700

    mvp of ml pipeline.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fb58064
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+storage/
+__pycache__/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7e18843
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+all:
+	python pipeline.py train
+
+data:
+	python data.py
+
+batch:
+	python batch.py
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/batch.py b/batch.py
new file mode 100644
index 0000000..37bf8e9
--- /dev/null
+++ b/batch.py
@@ -0,0 +1,70 @@
+import torch
+from torch import nn
+from torch import optim
+from torch.utils.data import DataLoader
+
+from data import FashionDataset
+from tqdm import tqdm
+from utils import Stage
+
+
+class Batch:
+    def __init__(
+        self,
+        stage: Stage,
+        model: nn.Module,
+        device: torch.device,
+        loader: DataLoader,
+        optimizer: optim.Optimizer,
+        criterion: nn.Module,
+    ):
+        """Run a model over a DataLoader for one full pass at the given stage."""
+        self.stage = stage
+        self.device = device
+        self.model = model.to(device)
+        self.loader = loader
+        self.criterion = criterion
+        self.optimizer = optimizer
+        self.loss = 0
+
+    def run(self, desc):
+        self.model.train()
+        # one pass over the loader; each step is a mini-batch, not an epoch
+        for x, y in tqdm(self.loader, desc=desc):
+            self.optimizer.zero_grad()
+            loss = self._run_batch((x, y))
+            loss.backward()  # Send loss backwards to accumulate gradients
+            self.optimizer.step()  # Perform a gradient update on the weights of the model
+            self.loss += loss.item()
+
+    def _run_batch(self, sample):
+        true_x, true_y = sample
+        true_x, true_y = true_x.to(self.device), true_y.to(self.device)
+        pred_y = self.model(true_x)
+        loss = self.criterion(pred_y, true_y)
+        return loss
+
+
+def main():
+    # A bare Conv2d emits (N, 64, H, W) maps, which CrossEntropyLoss rejects
+    # for (N,) class targets; pool and project down to 10 class logits instead.
+    model = nn.Sequential(
+        nn.Conv2d(1, 8, kernel_size=3),
+        nn.AdaptiveAvgPool2d(1),
+        nn.Flatten(),
+        nn.Linear(8, 10),
+    )
+    criterion = torch.nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
+    path = "fashion-mnist_train.csv"
+    dataset = FashionDataset(path)
+    batch_size = 16
+    num_workers = 1
+    loader = torch.utils.data.DataLoader(
+        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
+    )
+    batch = Batch(
+        Stage.TRAIN,
+        device=torch.device("cpu"),
+        model=model,
+        criterion=criterion,
+        optimizer=optimizer,
+        loader=loader,
+    )
+    batch.run("test")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..8605f75
--- /dev/null
+++ b/data.py
@@ -0,0 +1,54 @@
+from torch.utils.data import Dataset
+import einops
+import csv
+import torch
+
+
+class FashionDataset(Dataset):
+    def __init__(self, path: str):
+        self.path = path
+        self.x, self.y = self.load()
+
+    def __getitem__(self, idx):
+        return (self.x[idx], self.y[idx])
+
+    def __len__(self):
+        return len(self.x)
+
+    def load(self):
+        # open the CSV file
+        with open(self.path, mode="r") as file:
+            images = list()
+            classes = list()
+            csvFile = csv.reader(file)
+            # skip the header row
+            header = next(csvFile)
+            limit = 1000
+            for line in csvFile:
+                if limit < 1:
+                    break
+                # first column is the class label, the rest are pixel values
+                classes.append(int(line[0]))
+                images.append([int(x) for x in line[1:]])
+                limit -= 1
+        classes = torch.tensor(classes, dtype=torch.long)
+        images = torch.tensor(images, dtype=torch.float32)
+        images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
+        # add a channel dim and upsample 28x28 -> 224x224 by pixel repetition
+        images = einops.repeat(
+            images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
+        )
+        return (images, classes)
+
+
+def main():
+    path = "fashion-mnist_train.csv"
+    dataset = FashionDataset(path=path)
+    print(f"len: {len(dataset)}")
+    print(f"first shape: {dataset[0][0].shape}")
+    # __getitem__ returns an (images, classes) tuple; images are n c w h
+    images, _ = dataset[:10]
+    mean = einops.reduce(images, "n c w h -> w h", "mean")
+    print(f"mean shape: {mean.shape}")
+
+
+if __name__ == "__main__":
+    main()
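A quick way to sanity-check data.py is to pull one batch through a DataLoader. This is a minimal sketch, assuming fashion-mnist_train.csv sits in the working directory; the shapes follow from the einops.repeat upsampling above (28x28 -> 1x224x224).

# sketch: verify FashionDataset batch shapes and dtypes
from torch.utils.data import DataLoader
from data import FashionDataset

loader = DataLoader(FashionDataset("fashion-mnist_train.csv"), batch_size=16)
x, y = next(iter(loader))
print(x.shape, x.dtype)  # torch.Size([16, 1, 224, 224]) torch.float32
print(y.shape, y.dtype)  # torch.Size([16]) torch.int64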
"fashion-mnist_train.csv" + dataset = FashionDataset(path=path) + print(f"len: {len(dataset)}") + print(f"first shape: {dataset[0][0].shape}") + mean = einops.reduce(dataset[:10], "n w h -> w h", "mean") + print(f"mean shape: {mean.shape}") + + +if __name__ == "__main__": + main() diff --git a/model/cnn.py b/model/cnn.py new file mode 100644 index 0000000..51f983f --- /dev/null +++ b/model/cnn.py @@ -0,0 +1,152 @@ +from torch import nn + + +# the VGG11 architecture +class VGG11(nn.Module): + def __init__(self, in_channels, num_classes=1000): + super(VGG11, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + + # convolutional layers + self.conv_layers = nn.Sequential( + nn.Conv2d(self.in_channels, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 128, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(128, 256, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(256, 512, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(512, 512, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(512, 512, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(512, 512, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + # fully connected linear layers + self.linear_layers = nn.Sequential( + nn.Linear(in_features=512 * 7 * 7, out_features=4096), + nn.ReLU(), + nn.Dropout2d(0.5), + nn.Linear(in_features=4096, out_features=4096), + nn.ReLU(), + nn.Dropout2d(0.5), + nn.Linear(in_features=4096, out_features=self.num_classes), + ) + + def forward(self, x): + x = self.conv_layers(x) + # flatten to prepare for the fully connected layers + x = x.view(x.size(0), -1) + x = self.linear_layers(x) + return x + + +class VGG16(nn.Module): + def __init__(self, num_classes=10): + super(VGG16, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(64), + nn.ReLU(), + ) + self.layer2 = nn.Sequential( + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(64), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + self.layer3 = nn.Sequential( + nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(128), + nn.ReLU(), + ) + self.layer4 = nn.Sequential( + nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(128), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + self.layer5 = nn.Sequential( + nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(256), + nn.ReLU(), + ) + self.layer6 = nn.Sequential( + nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(256), + nn.ReLU(), + ) + self.layer7 = nn.Sequential( + nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(256), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + self.layer8 = nn.Sequential( + nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + ) + self.layer9 = nn.Sequential( + nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + ) + self.layer10 = nn.Sequential( + nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + self.layer11 = nn.Sequential( + nn.Conv2d(512, 512, kernel_size=3, stride=1, 
diff --git a/model/linear.py b/model/linear.py
new file mode 100644
index 0000000..a82fa13
--- /dev/null
+++ b/model/linear.py
@@ -0,0 +1,10 @@
+from torch import nn
+
+
+class DNN(nn.Module):
+    def __init__(self, in_dim, out_dim):
+        super(DNN, self).__init__()
+        self.layer1 = nn.Linear(in_dim, out_dim)
+
+    def forward(self, x):
+        return self.layer1(x)
diff --git a/mpv.py b/mpv.py
new file mode 100644
index 0000000..9bc08c8
--- /dev/null
+++ b/mpv.py
@@ -0,0 +1,158 @@
+# pytorch mlp for multiclass classification
+from numpy import vstack
+from numpy import argmax
+from pandas import read_csv
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import accuracy_score
+from torch import Tensor
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+from torch.nn import Linear
+from torch.nn import ReLU
+from torch.nn import Module
+from torch.optim import SGD
+from torch.nn import CrossEntropyLoss
+from torch.nn.init import kaiming_uniform_
+from torch.nn.init import xavier_uniform_
+
+
+# dataset definition
+class CSVDataset(Dataset):
+    # load the dataset
+    def __init__(self, path):
+        # load the csv file as a dataframe
+        df = read_csv(path, header=None)
+        # store the inputs and outputs
+        self.X = df.values[:, :-1]
+        self.y = df.values[:, -1]
+        # ensure input data is floats
+        self.X = self.X.astype('float32')
+        # label encode the target into int64 class indices
+        self.y = LabelEncoder().fit_transform(self.y)
+
+    # number of rows in the dataset
+    def __len__(self):
+        return len(self.X)
+
+    # get a row at an index
+    def __getitem__(self, idx):
+        return [self.X[idx], self.y[idx]]
+
+    # get indexes for train and test rows
+    def get_splits(self, n_test=0.33):
+        # determine sizes
+        test_size = round(n_test * len(self.X))
+        train_size = len(self.X) - test_size
+        # calculate the split
+        return random_split(self, [train_size, test_size])
+
+
+# model definition
+class MLP(Module):
+    # define model elements
+    def __init__(self, n_inputs):
+        super(MLP, self).__init__()
+        # input to first hidden layer
+        self.hidden1 = Linear(n_inputs, 10)
+        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
+        self.act1 = ReLU()
+        # second hidden layer
+        self.hidden2 = Linear(10, 8)
+        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
+        self.act2 = ReLU()
+        # third hidden layer and output
+        self.hidden3 = Linear(8, 3)
+        xavier_uniform_(self.hidden3.weight)
+
+    # forward propagate input
+    def forward(self, X):
+        # input to first hidden layer
+        X = self.hidden1(X)
+        X = self.act1(X)
+        # second hidden layer
+        X = self.hidden2(X)
+        X = self.act2(X)
+        # output layer: raw logits; CrossEntropyLoss applies log-softmax
+        # internally, so a Softmax activation here would squash twice
+        X = self.hidden3(X)
+        return X
+
+
+# prepare the dataset
+def prepare_data(path):
+    # load the dataset
+    dataset = CSVDataset(path)
+    # calculate split
+    train, test = dataset.get_splits()
+    # prepare data loaders
+    train_dl = DataLoader(train, batch_size=32, shuffle=True)
+    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
+    return train_dl, test_dl
+
+
+# train the model
+def train_model(train_dl, model):
+    # define the optimization
+    criterion = CrossEntropyLoss()
+    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+    # enumerate epochs
+    for epoch in range(500):
+        # enumerate mini batches
+        for i, (inputs, targets) in enumerate(train_dl):
+            # clear the gradients
+            optimizer.zero_grad()
+            # compute the model output
+            yhat = model(inputs)
+            # calculate loss
+            loss = criterion(yhat, targets)
+            # credit assignment
+            loss.backward()
+            # update model weights
+            optimizer.step()
+
+
+# evaluate the model
+def evaluate_model(test_dl, model):
+    predictions, actuals = list(), list()
+    for i, (inputs, targets) in enumerate(test_dl):
+        # evaluate the model on the test set
+        yhat = model(inputs)
+        # retrieve numpy array
+        yhat = yhat.detach().numpy()
+        actual = targets.numpy()
+        # convert logits to class labels
+        yhat = argmax(yhat, axis=1)
+        # reshape for stacking
+        actual = actual.reshape((len(actual), 1))
+        yhat = yhat.reshape((len(yhat), 1))
+        # store
+        predictions.append(yhat)
+        actuals.append(actual)
+    predictions, actuals = vstack(predictions), vstack(actuals)
+    # calculate accuracy
+    acc = accuracy_score(actuals, predictions)
+    return acc
+
+
+# make a class prediction for one row of data
+def predict(row, model):
+    # convert row to data
+    row = Tensor([row])
+    # make prediction
+    yhat = model(row)
+    # retrieve numpy array of logits
+    yhat = yhat.detach().numpy()
+    return yhat
+
+
+# prepare the data
+path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
+train_dl, test_dl = prepare_data(path)
+print(len(train_dl.dataset), len(test_dl.dataset))
+# define the network
+model = MLP(4)
+# train the model
+train_model(train_dl, model)
+# evaluate the model
+acc = evaluate_model(test_dl, model)
+print('Accuracy: %.3f' % acc)
+# make a single prediction
+row = [5.1, 3.5, 1.4, 0.2]
+yhat = predict(row, model)
+print('Predicted: %s (class=%d)' % (yhat, argmax(yhat)))
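With forward() returning raw logits, class probabilities at inference time come from an explicit softmax. A minimal sketch of the mechanics, assuming the MLP class from mpv.py above; the model here is freshly constructed and untrained:

# sketch: recover probabilities from logits at inference time
import torch
from torch.nn.functional import softmax

model = MLP(4)  # untrained; mechanics only
with torch.no_grad():
    logits = model(torch.tensor([[5.1, 3.5, 1.4, 0.2]]))
    probs = softmax(logits, dim=1)
print(probs)  # each row sums to 1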
diff --git a/pipeline.py b/pipeline.py
new file mode 100644
index 0000000..deb5dff
--- /dev/null
+++ b/pipeline.py
@@ -0,0 +1,43 @@
+import click
+from batch import Batch
+from model.linear import DNN
+from model.cnn import VGG16, VGG11
+from data import FashionDataset
+from utils import Stage
+import torch
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command()
+def train():
+    batch_size = 16
+    num_workers = 8
+
+    path = "fashion-mnist_train.csv"
+    trainset = FashionDataset(path=path)
+
+    trainloader = torch.utils.data.DataLoader(
+        trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers
+    )
+    model = VGG11(in_channels=1, num_classes=10)
+    criterion = torch.nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
+    batch = Batch(
+        stage=Stage.TRAIN,
+        model=model,
+        device=torch.device("cpu"),
+        loader=trainloader,
+        criterion=criterion,
+        optimizer=optimizer,
+    )
+    batch.run(
+        "Run run run run. Run run run away. "
+        "Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
+    )
+
+
+if __name__ == "__main__":
+    cli()
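Since the entrypoint is a click group, it can also be exercised without a shell via click's built-in test runner. A hedged sketch, equivalent to the Makefile's `make all` target (note this really runs training, so it is slow):

# sketch: invoke the train command programmatically
from click.testing import CliRunner
from pipeline import cli

runner = CliRunner()
result = runner.invoke(cli, ["train"])
print(result.exit_code)  # 0 on success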
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..73dcef7
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,7 @@
+from enum import Enum, auto
+
+
+class Stage(Enum):
+    TRAIN = auto()
+    DEV = auto()
+    TEST = auto()
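Stage is a plain Enum whose member values come from auto(); a small usage sketch:

# sketch: Stage members compare by identity, and auto() numbers from 1
from utils import Stage

stage = Stage.TRAIN
print(stage is Stage.TRAIN)  # True
print(stage.name, stage.value)  # TRAIN 1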