mvp of ml pipeline.
commit 5dab765216
.gitignore
@@ -0,0 +1,2 @@
storage/
__pycache__/
Makefile
@@ -0,0 +1,8 @@
# Default target: run training through the CLI in pipeline.py.
all:
	python pipeline.py train

# Smoke-test the dataset loader.
data:
	python data.py

# Smoke-test the batch runner.
batch:
	python batch.py
batch.py
@@ -0,0 +1,70 @@
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from data import FashionDataset
from tqdm import tqdm
from utils import Stage


class Batch:
    def __init__(
        self,
        stage: Stage,
        model: nn.Module,
        device: torch.device,
        loader: DataLoader,
        optimizer: optim.Optimizer,
        criterion: nn.Module,
    ):
        """Run a model over one pass of a DataLoader for the given stage."""
        self.stage = stage
        self.device = device
        self.model = model.to(device)
        self.loader = loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.loss = 0

    def run(self, desc):
        self.model.train()
        for x, y in tqdm(self.loader, desc=desc):
            self.optimizer.zero_grad()
            loss = self._run_batch((x, y))
            loss.backward()  # Send loss backwards to accumulate gradients
            self.optimizer.step()  # Perform a gradient update on the weights of the model
            self.loss += loss.item()

    def _run_batch(self, sample):
        true_x, true_y = sample
        true_x, true_y = true_x.to(self.device), true_y.to(self.device)
        pred_y = self.model(true_x)
        loss = self.criterion(pred_y, true_y)
        return loss


def main():
    # Minimal smoke-test model: flatten the 1x224x224 images produced by
    # FashionDataset and map them to 10 class logits, so the output has the
    # (batch, classes) shape that CrossEntropyLoss expects.
    model = nn.Sequential(nn.Flatten(), nn.Linear(224 * 224, 10))
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
    path = "fashion-mnist_train.csv"
    dataset = FashionDataset(path)
    batch_size = 16
    num_workers = 1
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    batch = Batch(
        Stage.TRAIN,
        device=torch.device("cpu"),
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loader=loader,
    )
    batch.run("test")


if __name__ == "__main__":
    main()
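
Note: batch.py and pipeline.py import Stage from a utils module that is not part of this commit. A minimal sketch of what utils.py presumably contains, assuming Stage is a plain enum of pipeline phases:

# utils.py, hypothetical sketch; the real module is not in this diff
from enum import Enum, auto


class Stage(Enum):
    TRAIN = auto()
    VAL = auto()
    TEST = auto()
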
data.py
@@ -0,0 +1,54 @@
from torch.utils.data import Dataset
import einops
import csv
import torch


class FashionDataset(Dataset):
    def __init__(self, path: str):
        self.path = path
        self.x, self.y = self.load()

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

    def __len__(self):
        return len(self.x)

    def load(self):
        # open and read the CSV file
        with open(self.path, mode="r") as file:
            images = list()
            classes = list()
            csv_file = csv.reader(file)
            # skip the header row
            next(csv_file)
            # cap the number of rows so the smoke test stays fast
            limit = 1000
            for line in csv_file:
                if limit < 1:
                    break
                # column 0 is the class label; the rest are pixel values
                classes.append(int(line[0]))
                images.append([int(x) for x in line[1:]])
                limit -= 1
        classes = torch.tensor(classes, dtype=torch.long)
        images = torch.tensor(images, dtype=torch.float32)
        # un-flatten each 784-pixel row into a 28x28 image
        images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
        # add a channel dim and upsample 28x28 -> 224x224 by repeating pixels
        images = einops.repeat(
            images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
        )
        return (images, classes)


def main():
    path = "fashion-mnist_train.csv"
    dataset = FashionDataset(path=path)
    print(f"len: {len(dataset)}")
    print(f"first shape: {dataset[0][0].shape}")
    # slicing returns an (images, classes) tuple; average the first ten images
    images, _ = dataset[:10]
    mean = einops.reduce(images, "n c w h -> w h", "mean")
    print(f"mean shape: {mean.shape}")


if __name__ == "__main__":
    main()
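
Note: the loader assumes the Kaggle Fashion-MNIST CSV layout (a header row, then one row per image: the label in column 0 followed by 784 pixel values). A throwaway sketch for generating a tiny stand-in file to smoke-test FashionDataset without downloading the real dataset; the helper name and row count are arbitrary:

# make_fake_csv.py, hypothetical helper, not part of this commit
import csv
import random

with open("fashion-mnist_train.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["label"] + [f"pixel{i}" for i in range(1, 785)])
    for _ in range(32):  # 32 fake 28x28 images
        writer.writerow(
            [random.randrange(10)] + [random.randrange(256) for _ in range(784)]
        )
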
model/cnn.py
@@ -0,0 +1,152 @@
from torch import nn


# the VGG11 architecture
class VGG11(nn.Module):
    def __init__(self, in_channels, num_classes=1000):
        super(VGG11, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes

        # convolutional layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(self.in_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # fully connected linear layers
        self.linear_layers = nn.Sequential(
            nn.Linear(in_features=512 * 7 * 7, out_features=4096),
            nn.ReLU(),
            # plain Dropout here: the features are flat (N, 4096) vectors,
            # not 2-D channel maps, so Dropout2d would be the wrong tool
            nn.Dropout(0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(in_features=4096, out_features=self.num_classes),
        )

    def forward(self, x):
        x = self.conv_layers(x)
        # flatten to prepare for the fully connected layers
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x
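
A quick shape check (a sketch, assuming the 1x224x224 inputs that FashionDataset produces; five 2x2 poolings take 224 down to 7, matching the 512 * 7 * 7 linear layer):

import torch
model = VGG11(in_channels=1, num_classes=10)
x = torch.randn(2, 1, 224, 224)  # batch of two single-channel 224x224 images
print(model(x).shape)  # torch.Size([2, 10])
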
class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.layer5 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        )
        self.layer6 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        )
        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )
        self.layer9 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )
        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.layer11 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )
        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        )
        self.layer13 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.fc = nn.Sequential(
            nn.Dropout(0.5), nn.Linear(7 * 7 * 512, 4096), nn.ReLU()
        )
        self.fc1 = nn.Sequential(nn.Dropout(0.5), nn.Linear(4096, 4096), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(4096, num_classes))

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
        out = self.layer9(out)
        out = self.layer10(out)
        out = self.layer11(out)
        out = self.layer12(out)
        out = self.layer13(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out
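
The same check for VGG16. Note it is hard-coded to 3 input channels, so it does not accept FashionDataset's single-channel images as-is (pipeline.py uses VGG11 for that reason); this sketch feeds it random RGB-shaped input:

import torch
model = VGG16(num_classes=10)
x = torch.randn(2, 3, 224, 224)  # batch of two RGB 224x224 images
print(model(x).shape)  # torch.Size([2, 10])
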
model/linear.py
@@ -0,0 +1,10 @@
from torch import nn


class DNN(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(DNN, self).__init__()
        self.layer1 = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        return self.layer1(x)
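
Usage sketch: DNN is a single linear layer over flat feature vectors, so 4-D image batches would need flattening first. Hypothetically mapping 784 raw pixels to 10 class logits:

import torch
model = DNN(in_dim=784, out_dim=10)
x = torch.randn(4, 784)  # four flattened 28x28 images
print(model(x).shape)  # torch.Size([4, 10])
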
@@ -0,0 +1,158 @@
# pytorch mlp for multiclass classification
from numpy import vstack
from numpy import argmax
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch import Tensor
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Module
from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_


# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        # label encode the target into integer class indices
        self.y = LabelEncoder().fit_transform(self.y)

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])


# model definition
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 10)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # second hidden layer
        self.hidden2 = Linear(10, 8)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # third hidden layer and output; it emits raw logits, with no Softmax,
        # because CrossEntropyLoss applies log-softmax internally
        self.hidden3 = Linear(8, 3)
        xavier_uniform_(self.hidden3.weight)

    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # output layer (logits)
        X = self.hidden3(X)
        return X


# prepare the dataset
def prepare_data(path):
    # load the dataset
    dataset = CSVDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl


# train the model
def train_model(train_dl, model):
    # define the optimization
    criterion = CrossEntropyLoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    # enumerate epochs
    for epoch in range(500):
        # enumerate mini batches
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()


# evaluate the model
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        # convert logits to class labels
        yhat = argmax(yhat, axis=1)
        # reshape for stacking
        actual = actual.reshape((len(actual), 1))
        yhat = yhat.reshape((len(yhat), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate accuracy
    acc = accuracy_score(actuals, predictions)
    return acc


# make a class prediction for one row of data
def predict(row, model):
    # convert row to a tensor
    row = Tensor([row])
    # make prediction (logits)
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat


# prepare the data
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
train_dl, test_dl = prepare_data(path)
print(len(train_dl.dataset), len(test_dl.dataset))
# define the network
model = MLP(4)
# train the model
train_model(train_dl, model)
# evaluate the model
acc = evaluate_model(test_dl, model)
print('Accuracy: %.3f' % acc)
# make a single prediction
row = [5.1,3.5,1.4,0.2]
yhat = predict(row, model)
print('Predicted: %s (class=%d)' % (yhat, argmax(yhat)))
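
The .gitignore above reserves a storage/ directory, which suggests trained weights are meant to live outside version control. A hypothetical sketch of persisting and restoring this MLP that way, appended to the script above (the filename is made up):

import os
import torch

os.makedirs("storage", exist_ok=True)
torch.save(model.state_dict(), "storage/mlp_iris.pt")

# reload for inference
restored = MLP(4)
restored.load_state_dict(torch.load("storage/mlp_iris.pt"))
restored.eval()
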
pipeline.py
@@ -0,0 +1,43 @@
import click
from batch import Batch
from model.linear import DNN
from model.cnn import VGG16, VGG11
from data import FashionDataset
from utils import Stage
import torch


@click.group()
def cli():
    pass


@cli.command()
def train():
    batch_size = 16
    num_workers = 8

    path = "fashion-mnist_train.csv"
    trainset = FashionDataset(path=path)

    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    model = VGG11(in_channels=1, num_classes=10)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
    batch = Batch(
        stage=Stage.TRAIN,
        model=model,
        device=torch.device("cpu"),
        loader=trainloader,
        criterion=criterion,
        optimizer=optimizer,
    )
    batch.run(
        "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
    )


if __name__ == "__main__":
    cli()
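
The train command is normally invoked as `python pipeline.py train`, which is what the Makefile's all target runs. For a programmatic smoke test, click ships a test runner; a minimal sketch (assumes fashion-mnist_train.csv is present, since invoking the command really runs training):

# hypothetical smoke test, not part of this commit
from click.testing import CliRunner
from pipeline import cli

runner = CliRunner()
result = runner.invoke(cli, ["train"])
print(result.exit_code)  # 0 on success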