add hydra config.
remove click. add launch script. add test dir. switch from fashion mnist to generic.
parent 404e39206b
commit 1f13224c4f
2 .gitignore
@@ -1,2 +1,4 @@
 storage/
 __pycache__/
+outputs/
+.env
2 Makefile
@@ -3,7 +3,7 @@ CONDA_ENV=ml_pipeline
 all: run
 
 run:
-	python src/pipeline.py train
+	./launch.sh
 
 data:
 	python src/data.py
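Note: `make run` now dispatches through `launch.sh` rather than invoking the pipeline directly. The script itself is not shown in this diff; presumably it wraps something like `python src/pipeline.py` plus any Hydra arguments, but that is an assumption.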
26 README.md
@@ -7,9 +7,9 @@ Instead of remembering where to put everything and making a different choice for
 Think of it like a mini PyTorch Lightning, with all the gory internals exposed for extension and modification.
 
 
-## Usage
+# Usage
 
-### Install:
+## Install:
 
 Install the conda requirements:
 
@@ -23,7 +23,7 @@ Which is a proxy for calling:
 conda env update -n ml_pipeline --file environment.yml
 ```
 
-### Run:
+## Run:
 
 Run the code on MNIST with the following command:
 
@@ -31,3 +31,23 @@ Run the code on MNIST with the following command:
 make run
 ```
 
+
+# Tutorial
+
+The motivation for building a template for deep learning pipelines is this: deep learning is hard enough without every codebase being a little different.
+
+Especially in a research lab, standardizing on a few components makes switching between projects easier.
+
+In this template, you'll see the following:
+
+- `src/model`, `src/config`, `storage`, `test` dirs.
+- `if __name__ == "__main__"` tests.
+- Hydra config.
+- dataloader, optimizer, criterion, device, and state are constructed in main, but passed to an object that runs batches.
+- tqdm to track progress.
+- a debug config flag that enables lots of breakpoints.
+- Python type hints.
+- a `launch.sh` script to dispatch training.
+- a Makefile to install and run stuff.
+- automatic linting with the `black` package.
+- collate functions!
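A note on the Hydra bullet above: because the entry point in `src/pipeline.py` is decorated with `@hydra.main` (see below), every key in `src/config/main.yaml` becomes a command-line override, e.g. `python src/pipeline.py lr=1e-3 debug=false`; the values in the YAML are just defaults.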
18 src/batch.py
@@ -2,21 +2,24 @@ import torch
 from torch import nn
 from torch import optim
 from torch.utils.data import DataLoader
-from data import FashionDataset
+from data import MnistDataset
 from tqdm import tqdm
 from utils import Stage
+from omegaconf import DictConfig
 
 
 class Batch:
     def __init__(
         self,
         stage: Stage,
-        model: nn.Module, device,
+        model: nn.Module,
+        device,
         loader: DataLoader,
         optimizer: optim.Optimizer,
         criterion: nn.Module,
+        config: DictConfig = None,
     ):
-        """todo"""
+        self.config = config
         self.stage = stage
         self.device = device
         self.model = model.to(device)
@@ -26,7 +29,11 @@ class Batch:
         self.loss = 0
 
     def run(self, desc):
-        self.model.train()
+        # set the model to train mode
+        if self.stage == Stage.TRAIN:
+            self.model.train()
+        if self.config.debug:
+            breakpoint()
         epoch = 0
         for epoch, (x, y) in enumerate(tqdm(self.loader, desc=desc)):
             self.optimizer.zero_grad()
@@ -34,6 +41,7 @@ class Batch:
             loss.backward()  # Send loss backwards to accumulate gradients
             self.optimizer.step()  # Perform a gradient update on the weights of the model
             self.loss += loss.item()
+        return self.loss
 
     def _run_batch(self, sample):
         true_x, true_y = sample
@@ -47,8 +55,8 @@ def main():
     model = nn.Conv2d(1, 64, 3)
     criterion = torch.nn.CrossEntropyLoss()
     optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path)
+    path = "mnist_train.csv"
+    dataset = MnistDataset(path)
     batch_size = 16
     num_workers = 1
     loader = torch.utils.data.DataLoader(
6 src/collate.py
@@ -0,0 +1,6 @@
+from einops import rearrange
+
+
+def channel_to_batch(batch):
+    """TODO"""
+    return batch
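`channel_to_batch` is only a `"""TODO"""` stub so far (it is wired into the `DataLoader` in `src/pipeline.py` below, but commented out). As a sketch of what a collate function with this name might eventually do, here is one hypothetical implementation that folds the channel axis into the batch axis; the shapes and behavior are assumptions, not part of this commit:

```python
import torch
from einops import rearrange


def channel_to_batch(samples):
    """Hypothetical collate_fn: treat each channel as its own sample.

    `samples` is the list of (image, label) pairs a DataLoader passes to its
    collate_fn, where each image has shape (c, h, w).
    """
    xs = torch.stack([x for x, _ in samples])      # (b, c, h, w)
    ys = torch.tensor([y for _, y in samples])     # (b,)
    xs = rearrange(xs, "b c h w -> (b c) 1 h w")   # fold channels into batch
    ys = ys.repeat_interleave(xs.shape[0] // ys.shape[0])  # one label per channel
    return xs, ys
```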
6 src/config/main.yaml
@@ -0,0 +1,6 @@
+app_dir: ${hydra:runtime.cwd}
+debug: true
+lr: 2e-4
+batch_size: 16
+num_workers: 0
+device: "cpu"
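`app_dir: ${hydra:runtime.cwd}` resolves to the directory the process was launched from: Hydra changes the working directory into a per-run `outputs/` folder (hence the new `.gitignore` entry), so `src/pipeline.py` uses `app_dir` to find `storage/mnist_train.csv`. For code that runs outside `@hydra.main`, such as the new tests, the config can be loaded with Hydra's compose API. A minimal sketch; the compose API is real Hydra, but the relative path is an assumption about where the calling module lives:

```python
from hydra import compose, initialize

# config_path is relative to this file; "../src/config" assumes the caller
# sits one level below the repo root (e.g. in test/).
with initialize(config_path="../src/config"):
    cfg = compose(config_name="main")

assert cfg.batch_size == 16
assert cfg.device == "cpu"
# Note: cfg.app_dir uses the ${hydra:...} resolver, which is only populated
# under @hydra.main, so avoid resolving it in compose mode.
```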
48 src/data.py
@@ -3,51 +3,69 @@ import numpy as np
 import einops
 import csv
 import torch
+from pathlib import Path
+from typing import Tuple
 
 
-class FashionDataset(Dataset):
-    def __init__(self, path: str):
+class MnistDataset(Dataset):
+    """
+    The MNIST database of handwritten digits.
+    Training set is 60k labeled examples, test is 10k examples.
+    The b/w images are normalized to 20x20, preserving aspect ratio.
+
+    It's the de facto standard image training set for learning about classification in DL.
+    """
+
+    def __init__(self, path: Path):
+        """
+        give a path to a dir that contains the following csv files:
+        https://pjreddie.com/projects/mnist-in-csv/
+        """
         self.path = path
-        self.x, self.y = self.load()
+        self.features, self.labels = self.load()
 
     def __getitem__(self, idx):
-        return (self.x[idx], self.y[idx])
+        return (self.features[idx], self.labels[idx])
 
     def __len__(self):
-        return len(self.x)
+        return len(self.features)
 
-    def load(self):
+    def load(self) -> Tuple[torch.Tensor, torch.Tensor]:
         # opening the CSV file
         with open(self.path, mode="r") as file:
             images = list()
-            classes = list()
+            labels = list()
             # reading the CSV file
             csvFile = csv.reader(file)
            # displaying the contents of the CSV file
-            header = next(csvFile)
+            # header = next(csvFile)
             limit = 1000
             for line in csvFile:
                 if limit < 1:
                     break
-                classes.append(int(line[:1][0]))
-                images.append([int(x) for x in line[1:]])
+                label = int(line[0])
+                labels.append(label)
+                image = [int(x) for x in line[1:]]
+                images.append(image)
                 limit -= 1
-        classes = torch.tensor(classes, dtype=torch.long)
+        labels = torch.tensor(labels, dtype=torch.long)
         images = torch.tensor(images, dtype=torch.float32)
         images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
         images = einops.repeat(
             images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
         )
-        return (images, classes)
+        return (images, labels)
 
 
 def main():
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path=path)
+    path = "storage/mnist_train.csv"
+    dataset = MnistDataset(path=path)
     print(f"len: {len(dataset)}")
     print(f"first shape: {dataset[0][0].shape}")
-    mean = einops.reduce(dataset[:10], "n w h -> w h", "mean")
+    mean = einops.reduce(dataset[:10][0], "n w h -> w h", "mean")
     print(f"mean shape: {mean.shape}")
+    print(f"mean image: {mean}")
 
 
 if __name__ == "__main__":
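The `einops.repeat` call above does real work: it nearest-neighbor upsamples each 28×28 image by 8× in both dimensions and adds a channel axis, yielding `(n, 1, 224, 224)` tensors that match the input size the VGG models expect. A standalone illustration (not from the repo):

```python
import torch
import einops

images = torch.rand(4, 28, 28)  # a fake batch of MNIST-sized images
big = einops.repeat(images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8)
print(big.shape)  # torch.Size([4, 1, 224, 224]) -- each pixel tiled 8x8
```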
46 src/pipeline.py
@@ -3,46 +3,54 @@ main class for building a DL pipeline.
 
 """
 
-import click
 from batch import Batch
 from model.linear import DNN
 from model.cnn import VGG16, VGG11
-from data import FashionDataset
+from data import MnistDataset
 from utils import Stage
 import torch
+from pathlib import Path
+from collate import channel_to_batch
+
+import hydra
+from omegaconf import DictConfig
 
 
-@click.group()
-def cli():
-    pass
-
-
-@cli.command()
-def train():
-    batch_size = 16
-    num_workers = 8
-
-    path = "fashion-mnist_train.csv"
-    trainset = FashionDataset(path=path)
+@hydra.main(config_path="config", config_name="main")
+def train(config: DictConfig):
+    if config.debug:
+        breakpoint()
+    lr = config.lr
+    batch_size = config.batch_size
+    num_workers = config.num_workers
+    device = config.device
+
+    path = Path(config.app_dir) / "storage/mnist_train.csv"
+    trainset = MnistDataset(path=path)
 
     trainloader = torch.utils.data.DataLoader(
-        trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers
+        trainset,
+        batch_size=batch_size,
+        shuffle=False,
+        num_workers=num_workers,
+        # collate_fn=channel_to_batch,
    )
     model = VGG11(in_channels=1, num_classes=10)
     criterion = torch.nn.CrossEntropyLoss()
-    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
     batch = Batch(
         stage=Stage.TRAIN,
         model=model,
-        device=torch.device("cpu"),
+        device=torch.device(device),
         loader=trainloader,
         criterion=criterion,
         optimizer=optimizer,
+        config=config,
     )
-    batch.run(
+    log = batch.run(
         "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
     )
 
 
 if __name__ == "__main__":
-    cli()
+    train()
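Two details tie this file to the rest of the commit: `log = batch.run(...)` captures the accumulated loss that `Batch.run` now returns, and the commented-out `collate_fn=channel_to_batch` line is where the stub in `src/collate.py` will plug in once implemented. With click gone, there is no subcommand to name: `./launch.sh` (or `python src/pipeline.py`) trains directly, with config overrides on the command line.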
@@ -0,0 +1,10 @@
+from src.model.linear import DNN
+from src.data import GenericDataset
+import os
+
+
+def test_size_of_dataset():
+    features = 40
+    os.environ["INPUT_FEATURES"] = str(features)
+    dataset = GenericDataset()
+    assert len(dataset[0][0]) == features
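This new test file (its path is not shown in the diff, but the commit message adds a `test` dir) imports `GenericDataset` and reads an `INPUT_FEATURES` environment variable, neither of which appears elsewhere in the commit. A hypothetical minimal implementation that would satisfy `test_size_of_dataset`, purely for illustration:

```python
import os

import torch
from torch.utils.data import Dataset


class GenericDataset(Dataset):
    """Hypothetical dataset of random feature vectors, sized by an env var."""

    def __init__(self, n_samples: int = 100):
        n_features = int(os.environ.get("INPUT_FEATURES", 10))
        self.features = torch.rand(n_samples, n_features)
        self.labels = torch.randint(0, 2, (n_samples,))

    def __getitem__(self, idx):
        return (self.features[idx], self.labels[idx])

    def __len__(self):
        return len(self.features)
```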