add hydra config.
remove click. add launch script. add test dir. switch from fashion mnist to generic.
parent 404e39206b
commit 1f13224c4f
.gitignore
@@ -1,2 +1,4 @@
 storage/
 __pycache__/
+outputs/
+.env
Makefile
@@ -3,7 +3,7 @@ CONDA_ENV=ml_pipeline
 all: run
 
 run:
-	python src/pipeline.py train
+	./launch.sh
 
 data:
 	python src/data.py
README.md
@@ -7,9 +7,9 @@ Instead of remembering where to put everything and making a different choice for
 Think of it like a mini PyTorch Lightning, with all the gory internals exposed for extension and modification.
 
-## Usage
+# Usage
 
-### Install:
+## Install:
 
 Install the conda requirements:
 
@@ -23,7 +23,7 @@ Which is a proxy for calling:
 conda env update -n ml_pipeline --file environment.yml
 ```
 
-### Run:
+## Run:
 
 Run the code on MNIST with the following command:
 
@@ -31,3 +31,23 @@ Run the code on MNIST with the following command:
 make run
 ```
+
+# Tutorial
+
+The motivation for building a template for deep learning pipelines is this: deep learning is hard enough without every code base being a little different.
+
+Especially in a research lab, standardizing on a few components makes switching between projects easier.
+
+In this template, you'll see the following:
+
+- `src/model`, `src/config`, `storage`, `test` dirs.
+- `if __name__ == "__main__"` tests.
+- Hydra config (see the sketch below).
+- dataloader, optimizer, criterion, device, and state constructed in main, but passed to an object that runs batches.
+- tqdm to track progress.
+- a debug config flag that enables lots of breakpoints.
+- python type hints.
+- a `launch.sh` script to dispatch training.
+- a Makefile to install and run stuff.
+- automatic linting with the `black` package.
+- collate functions!
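Since the tutorial list calls out the Hydra config, here is a minimal, self-contained sketch of the pattern (editor's illustration; `sketch.py` is hypothetical and not part of this commit):

```python
# sketch.py -- hypothetical minimal Hydra app, mirroring the pattern
# this template uses in src/pipeline.py.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="config", config_name="main")
def main(config: DictConfig) -> None:
    # Hydra loads config/main.yaml, applies any command-line overrides
    # (e.g. `python sketch.py lr=1e-3`), and injects the merged config.
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()
```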
src/batch.py
@@ -2,21 +2,24 @@ import torch
 from torch import nn
 from torch import optim
 from torch.utils.data import DataLoader
-from data import FashionDataset
+from data import MnistDataset
 from tqdm import tqdm
 from utils import Stage
+from omegaconf import DictConfig
 
 
 class Batch:
     def __init__(
         self,
         stage: Stage,
-        model: nn.Module, device,
+        model: nn.Module,
+        device,
         loader: DataLoader,
         optimizer: optim.Optimizer,
         criterion: nn.Module,
+        config: DictConfig = None,
     ):
         """todo"""
+        self.config = config
         self.stage = stage
         self.device = device
         self.model = model.to(device)
@@ -26,7 +29,11 @@ class Batch:
         self.loss = 0
 
     def run(self, desc):
+        # set the model to train mode
         if self.stage == Stage.TRAIN:
             self.model.train()
+        if self.config.debug:
+            breakpoint()
+        epoch = 0
         for epoch, (x, y) in enumerate(tqdm(self.loader, desc=desc)):
             self.optimizer.zero_grad()
@@ -34,6 +41,7 @@ class Batch:
         loss.backward()  # Send loss backwards to accumulate gradients
         self.optimizer.step()  # Perform a gradient update on the weights of the model
         self.loss += loss.item()
+        return self.loss
 
     def _run_batch(self, sample):
         true_x, true_y = sample
@@ -47,8 +55,8 @@ def main():
     model = nn.Conv2d(1, 64, 3)
     criterion = torch.nn.CrossEntropyLoss()
     optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path)
+    path = "mnist_train.csv"
+    dataset = MnistDataset(path)
     batch_size = 16
     num_workers = 1
     loader = torch.utils.data.DataLoader(
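As a reading aid (not part of the commit), here is a minimal sketch of the dependency-injection pattern `Batch` follows: everything is constructed in main and handed to the runner object. The stand-in model and `TensorDataset` are hypothetical:

```python
# Hypothetical usage sketch of the Batch runner pattern above.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Stand-in data: 64 one-channel 28x28 "images" with 10 classes.
x = torch.randn(64, 1, 28, 28)
y = torch.randint(0, 10, (64,))
loader = DataLoader(TensorDataset(x, y), batch_size=16)

model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

# The runner owns no construction logic; it only loops over batches:
# batch = Batch(stage=Stage.TRAIN, model=model, device=torch.device("cpu"),
#               loader=loader, optimizer=optimizer, criterion=criterion,
#               config=config)
# loss = batch.run("train")
```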
src/collate.py
@@ -0,0 +1,6 @@
+from einops import rearrange
+
+
+def channel_to_batch(batch):
+    """TODO"""
+    return batch
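The committed `channel_to_batch` is a pass-through stub. As a hedged illustration only (not the author's intended implementation), here is one way such a collate function could fold channels into the batch dimension; it would plug in via the `collate_fn=channel_to_batch` argument that `src/pipeline.py` leaves commented out:

```python
# Hypothetical completion of channel_to_batch, for illustration only.
import torch
from einops import rearrange


def channel_to_batch(batch):
    """Collate (image, label) pairs, merging channels into the batch dim."""
    images = torch.stack([x for x, _ in batch])          # (b, c, h, w)
    labels = torch.as_tensor([int(y) for _, y in batch]) # (b,)
    images = rearrange(images, "b c h w -> (b c) h w")   # (b*c, h, w)
    # repeat each label once per channel of its image
    labels = labels.repeat_interleave(images.shape[0] // labels.shape[0])
    return images, labels
```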
src/config/main.yaml
@@ -0,0 +1,6 @@
+app_dir: ${hydra:runtime.cwd}
+debug: true
+lr: 2e-4
+batch_size: 16
+num_workers: 0
+device: "cpu"
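Two notes on this config (editor's gloss): `${hydra:runtime.cwd}` is a Hydra resolver that records the directory the job was launched from, and every key here can be overridden on the command line. A small sketch of the override behavior, reproduced with plain OmegaConf (the override values are hypothetical):

```python
# Command-line overrides are merged on top of src/config/main.yaml, e.g.
#   python src/pipeline.py                  # defaults: lr=2e-4, debug=true
#   python src/pipeline.py debug=false      # turn off breakpoints
#   python src/pipeline.py lr=1e-3 device=cuda batch_size=64
from omegaconf import OmegaConf

# The same merge, reproduced programmatically:
base = OmegaConf.create({"lr": 2e-4, "debug": True})
override = OmegaConf.from_dotlist(["lr=1e-3", "debug=false"])
merged = OmegaConf.merge(base, override)
print(OmegaConf.to_yaml(merged))  # lr: 0.001, debug: false
```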
src/data.py
@@ -3,51 +3,69 @@ import numpy as np
 import einops
 import csv
 import torch
+from pathlib import Path
+from typing import Tuple
 
 
-class FashionDataset(Dataset):
-    def __init__(self, path: str):
+class MnistDataset(Dataset):
+    """
+    The MNIST database of handwritten digits.
+    Training set is 60k labeled examples, test is 10k examples.
+    The b/w images are normalized to 20x20, preserving aspect ratio.
+
+    It's the de facto standard image training set for learning about classification in DL.
+    """
+
+    def __init__(self, path: Path):
+        """
+        Give a path to a csv file in the format described here:
+        https://pjreddie.com/projects/mnist-in-csv/
+        """
         self.path = path
-        self.x, self.y = self.load()
+        self.features, self.labels = self.load()
 
     def __getitem__(self, idx):
-        return (self.x[idx], self.y[idx])
+        return (self.features[idx], self.labels[idx])
 
     def __len__(self):
-        return len(self.x)
+        return len(self.features)
 
-    def load(self):
+    def load(self) -> Tuple[torch.Tensor, torch.Tensor]:
         # opening the CSV file
         with open(self.path, mode="r") as file:
             images = list()
-            classes = list()
+            labels = list()
            # reading the CSV file
             csvFile = csv.reader(file)
-            # displaying the contents of the CSV file
-            header = next(csvFile)
+            # header = next(csvFile)
             limit = 1000
             for line in csvFile:
                 if limit < 1:
                     break
-                classes.append(int(line[:1][0]))
-                images.append([int(x) for x in line[1:]])
+                label = int(line[0])
+                labels.append(label)
+                image = [int(x) for x in line[1:]]
+                images.append(image)
                 limit -= 1
-        classes = torch.tensor(classes, dtype=torch.long)
+        labels = torch.tensor(labels, dtype=torch.long)
         images = torch.tensor(images, dtype=torch.float32)
         images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
         images = einops.repeat(
             images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
         )
-        return (images, classes)
+        return (images, labels)
 
 
 def main():
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path=path)
+    path = "storage/mnist_train.csv"
+    dataset = MnistDataset(path=path)
     print(f"len: {len(dataset)}")
     print(f"first shape: {dataset[0][0].shape}")
-    mean = einops.reduce(dataset[:10], "n w h -> w h", "mean")
+    mean = einops.reduce(dataset[:10][0], "n c w h -> w h", "mean")
     print(f"mean shape: {mean.shape}")
     print(f"mean image: {mean}")
 
 
 if __name__ == "__main__":
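A quick shape walkthrough of the two einops calls in `load()` (editor's illustration on random stand-in data rather than the CSV): the flat rows are unflattened to 28x28, then each pixel is repeated into an 8x8 block, giving 224x224 inputs sized for the VGG11 used in `src/pipeline.py`:

```python
# Shape walkthrough of the einops calls in MnistDataset.load().
import torch
import einops

images = torch.rand(16, 784)  # 16 flat rows of 28*28 pixels

# unflatten: (n, 784) -> (n, 28, 28)
images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
assert images.shape == (16, 28, 28)

# add a channel dim and upsample 8x by repeating pixels:
# (n, 28, 28) -> (n, 1, 224, 224)
images = einops.repeat(images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8)
assert images.shape == (16, 1, 224, 224)
```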
src/pipeline.py
@@ -3,46 +3,54 @@ main class for building a DL pipeline.
 
 """
 
-import click
 from batch import Batch
 from model.linear import DNN
 from model.cnn import VGG16, VGG11
-from data import FashionDataset
+from data import MnistDataset
 from utils import Stage
 import torch
+from pathlib import Path
+from collate import channel_to_batch
+
+import hydra
+from omegaconf import DictConfig
 
 
-@click.group()
-def cli():
-    pass
-
-
-@cli.command()
-def train():
-    batch_size = 16
-    num_workers = 8
-
-    path = "fashion-mnist_train.csv"
-    trainset = FashionDataset(path=path)
+@hydra.main(config_path="config", config_name="main")
+def train(config: DictConfig):
+    if config.debug:
+        breakpoint()
+    lr = config.lr
+    batch_size = config.batch_size
+    num_workers = config.num_workers
+    device = config.device
+
+    path = Path(config.app_dir) / "storage/mnist_train.csv"
+    trainset = MnistDataset(path=path)
 
     trainloader = torch.utils.data.DataLoader(
-        trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers
+        trainset,
+        batch_size=batch_size,
+        shuffle=False,
+        num_workers=num_workers,
+        # collate_fn=channel_to_batch,
     )
     model = VGG11(in_channels=1, num_classes=10)
     criterion = torch.nn.CrossEntropyLoss()
-    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
     batch = Batch(
         stage=Stage.TRAIN,
         model=model,
-        device=torch.device("cpu"),
+        device=torch.device(device),
         loader=trainloader,
         criterion=criterion,
         optimizer=optimizer,
+        config=config,
    )
-    batch.run(
+    log = batch.run(
         "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
     )
 
 
 if __name__ == "__main__":
-    cli()
+    train()
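One subtlety worth a gloss (editor's note, with hypothetical paths): Hydra runs each job in a fresh working directory under `outputs/` (which is why this commit also adds `outputs/` to `.gitignore`), so a relative path like `storage/mnist_train.csv` would no longer resolve. The `app_dir: ${hydra:runtime.cwd}` entry captures the launch directory so the data path can be rebuilt absolutely:

```python
# Sketch of the app_dir mechanics; the paths here are hypothetical.
from pathlib import Path

# Inside a Hydra job, the cwd is something like
# <repo>/outputs/2021-01-01/12-00-00, not the repo root.
# ${hydra:runtime.cwd} resolved to the repo root at launch time:
app_dir = "/home/user/ml_pipeline"
data_path = Path(app_dir) / "storage/mnist_train.csv"
print(data_path)  # /home/user/ml_pipeline/storage/mnist_train.csv
```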
@@ -0,0 +1,10 @@
+from src.model.linear import DNN
+from src.data import GenericDataset
+import os
+
+
+def test_size_of_dataset():
+    features = 40
+    os.environ["INPUT_FEATURES"] = str(features)
+    dataset = GenericDataset()
+    assert len(dataset[0][0]) == features
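The new test module (its path is not shown in the truncated diff) sizes the dataset through an environment variable. `GenericDataset` itself does not appear anywhere in this commit, so the following is purely a hedged sketch of an implementation that would satisfy the test, not the author's code:

```python
# Hypothetical GenericDataset consistent with the test above.
import os
import torch
from torch.utils.data import Dataset


class GenericDataset(Dataset):
    """Random regression data sized by the INPUT_FEATURES env var."""

    def __init__(self, n: int = 100):
        features = int(os.environ.get("INPUT_FEATURES", 10))
        self.x = torch.randn(n, features)
        self.y = torch.randn(n, 1)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

    def __len__(self):
        return len(self.x)
```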