add hydra config.

remove click.
add launch script.
add test dir.
switch from fashion mnist to generic.
This commit is contained in:
Matt 2023-01-26 07:25:07 -08:00
parent 404e39206b
commit 1f13224c4f
11 changed files with 123 additions and 43 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
storage/
__pycache__/
outputs/
.env

View File

@ -3,7 +3,7 @@ CONDA_ENV=ml_pipeline
all: run
run:
python src/pipeline.py train
./launch.sh
data:
python src/data.py

View File

@ -7,9 +7,9 @@ Instead of remembering where to put everything and making a different choice for
Think of it like a mini PyTorch Lightning, with all the gory internals exposed for extension and modification.
## Usage
# Usage
### Install:
## Install:
Install the conda requirements:
@ -23,7 +23,7 @@ Which is a proxy for calling:
conda env update -n ml_pipeline --file environment.yml
```
### Run:
## Run:
Run the code on MNIST with the following command:
@ -31,3 +31,23 @@ Run the code on MNIST with the following command:
make run
```
# Tutorial
The motivation for building a template for deep learning pipelines is this: deep learning is hard enough without every codebase being a little different.
Especially in a research lab, standardizing on a few components makes switching between projects easier.
In this template, you'll see the following:
- `src/model`, `src/config`, `storage`, `test` dirs.
- `if __name__ == "__main__"` tests.
- Hydra config.
- dataloader, optimizer, criterion, device, state are constructed in main, but passed to an object that runs batches.
- tqdm to track progress.
- a debug config flag that enables lots of breakpoints.
- python type hints.
- a `launch.sh` script to dispatch training.
- a Makefile to install and run stuff.
- automatic linting with the `black` package.
- collate functions!

2
launch.sh Executable file
View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Dispatch a training run through the Hydra entry point in src/pipeline.py.
# Any extra arguments are forwarded as additional Hydra overrides,
# e.g. `./launch.sh lr=1e-3 batch_size=32`.
python src/pipeline.py \
    debug=false \
    "$@"

View File

@ -2,21 +2,24 @@ import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from data import FashionDataset
from data import MnistDataset
from tqdm import tqdm
from utils import Stage
from omegaconf import DictConfig
class Batch:
def __init__(
self,
stage: Stage,
model: nn.Module, device,
model: nn.Module,
device,
loader: DataLoader,
optimizer: optim.Optimizer,
criterion: nn.Module,
config: DictConfig = None,
):
"""todo"""
self.config = config
self.stage = stage
self.device = device
self.model = model.to(device)
@ -26,7 +29,11 @@ class Batch:
self.loss = 0
def run(self, desc):
# set the model to train mode
if self.stage == Stage.TRAIN:
self.model.train()
if self.config.debug:
breakpoint()
epoch = 0
for epoch, (x, y) in enumerate(tqdm(self.loader, desc=desc)):
self.optimizer.zero_grad()
@ -34,6 +41,7 @@ class Batch:
loss.backward() # Send loss backwards to accumulate gradients
self.optimizer.step() # Perform a gradient update on the weights of the mode
self.loss += loss.item()
return self.loss
def _run_batch(self, sample):
true_x, true_y = sample
@ -47,8 +55,8 @@ def main():
model = nn.Conv2d(1, 64, 3)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
path = "fashion-mnist_train.csv"
dataset = FashionDataset(path)
path = "mnist_train.csv"
dataset = MnistDataset(path)
batch_size = 16
num_workers = 1
loader = torch.utils.data.DataLoader(

6
src/collate.py Normal file
View File

@ -0,0 +1,6 @@
from einops import rearrange
def channel_to_batch(batch):
    """
    Placeholder collate function.

    TODO: implement the channel-to-batch rearrangement. Until then the
    incoming ``batch`` is handed back unchanged.
    """
    return batch

6
src/config/main.yaml Normal file
View File

@ -0,0 +1,6 @@
app_dir: ${hydra:runtime.cwd}
debug: true
lr: 2e-4
batch_size: 16
num_workers: 0
device: "cpu"

View File

@ -3,51 +3,69 @@ import numpy as np
import einops
import csv
import torch
from pathlib import Path
from typing import Tuple
class MnistDataset(Dataset):
    """
    The MNIST database of handwritten digits, read from a CSV file.

    Every CSV row is expected to be ``label, pixel, pixel, ...`` with
    28 * 28 pixel values (https://pjreddie.com/projects/mnist-in-csv/).
    Images are upscaled 8x along both spatial axes and given a single
    channel axis, so ``features`` has shape ``(n, 1, 224, 224)``.
    """

    def __init__(self, path: Path, limit: int = 1000):
        """
        path: the CSV file to read.
        limit: maximum number of rows to load (``None`` loads every row).
        """
        self.path = path
        self.limit = limit
        self.features, self.labels = self.load()

    def __getitem__(self, idx):
        return (self.features[idx], self.labels[idx])

    def __len__(self):
        return len(self.features)

    def load(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Parse the CSV and return ``(images, labels)`` as tensors."""
        from itertools import islice

        images = []
        labels = []
        # newline="" is what the csv module expects for files it reads
        with open(self.path, mode="r", newline="") as file:
            # blank lines come back as empty rows; drop them before indexing
            rows = (row for row in csv.reader(file) if row)
            for row in islice(rows, self.limit):
                labels.append(int(row[0]))
                images.append([int(pixel) for pixel in row[1:]])

        labels = torch.tensor(labels, dtype=torch.long)
        images = torch.tensor(images, dtype=torch.float32)
        # flat 784-pixel rows -> 28x28 images
        images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
        # add a channel axis and upscale 28x28 -> 224x224
        images = einops.repeat(
            images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
        )
        return (images, labels)
def main():
    """Smoke test: load the MNIST CSV and print a few summary statistics."""
    path = "storage/mnist_train.csv"
    dataset = MnistDataset(path=path)
    print(f"len: {len(dataset)}")
    print(f"first shape: {dataset[0][0].shape}")
    # Slicing the dataset yields a (features, labels) tuple; features carry
    # a channel axis (n, c, w, h), so the pattern must name all four axes.
    images, _ = dataset[:10]
    mean = einops.reduce(images, "n c w h -> w h", "mean")
    print(f"mean shape: {mean.shape}")
    print(f"mean image: {mean}")
if __name__ == "__main__":

View File

@ -3,46 +3,54 @@ main class for building a DL pipeline.
"""
import click
from batch import Batch
from model.linear import DNN
from model.cnn import VGG16, VGG11
from data import FashionDataset
from data import MnistDataset
from utils import Stage
import torch
from pathlib import Path
from collate import channel_to_batch
import hydra
from omegaconf import DictConfig
@click.group()
def cli():
pass
@hydra.main(config_path="config", config_name="main")
def train(config: DictConfig):
    """
    Train VGG11 on the MNIST CSV using the values in the Hydra config.

    Reads ``debug``, ``lr``, ``batch_size``, ``num_workers``, ``device`` and
    ``app_dir`` from ``config``.
    """
    # with debug enabled, stop here before anything is constructed
    if config.debug:
        breakpoint()

    # config/main.yaml sets app_dir from ${hydra:runtime.cwd}
    data_path = Path(config.app_dir) / "storage/mnist_train.csv"
    trainloader = torch.utils.data.DataLoader(
        MnistDataset(path=data_path),
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        # collate_fn=channel_to_batch,
    )

    model = VGG11(in_channels=1, num_classes=10)
    batch = Batch(
        stage=Stage.TRAIN,
        model=model,
        device=torch.device(config.device),
        loader=trainloader,
        optimizer=torch.optim.Adam(model.parameters(), lr=config.lr),
        criterion=torch.nn.CrossEntropyLoss(),
        config=config,
    )
    log = batch.run(
        "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
    )
if __name__ == "__main__":
cli()
train()

0
test/__init__.py Normal file
View File

10
test/test_pipeline.py Normal file
View File

@ -0,0 +1,10 @@
from src.model.linear import DNN
from src.data import GenericDataset
import os
def test_size_of_dataset():
    """
    A GenericDataset sample should have INPUT_FEATURES features.

    NOTE(review): presumably GenericDataset reads INPUT_FEATURES from the
    environment; confirm against src/data.py.
    """
    features = 40
    previous = os.environ.get("INPUT_FEATURES")
    os.environ["INPUT_FEATURES"] = str(features)
    try:
        dataset = GenericDataset()
        assert len(dataset[0][0]) == features
    finally:
        # put the environment back so other tests are not affected
        if previous is None:
            os.environ.pop("INPUT_FEATURES", None)
        else:
            os.environ["INPUT_FEATURES"] = previous