add hydra config.
remove click. add launch script. add test dir. switch from fashion mnist to generic.
parent 404e39206b
commit 1f13224c4f
.gitignore
@@ -1,2 +1,4 @@
 storage/
 __pycache__/
+outputs/
+.env
Makefile
@@ -3,7 +3,7 @@ CONDA_ENV=ml_pipeline
 all: run
 
 run:
-	python src/pipeline.py train
+	./launch.sh
 
 data:
 	python src/data.py
README.md
@@ -7,9 +7,9 @@ Instead of remembering where to put everything and making a different choice for
 Think of it like a mini PyTorch Lightning, with all the gory internals exposed for extension and modification.
 
-## Usage
+# Usage
 
-### Install:
+## Install:
 
 Install the conda requirements:
 
@@ -23,7 +23,7 @@ Which is a proxy for calling:
 conda env update -n ml_pipeline --file environment.yml
 ```
 
-### Run:
+## Run:
 
 Run the code on MNIST with the following command:
 
@@ -31,3 +31,23 @@ Run the code on MNIST with the following command:
 make run
 ```
+
+# Tutorial
+
+The motivation for building a template for deep learning pipelines is this: deep learning is hard enough without every code base being a little different.
+
+Especially in a research lab, standardizing on a few components makes switching between projects easier.
+
+In this template, you'll see the following:
+
+- `src/model`, `src/config`, `storage`, `test` dirs.
+- `if __name__ == "__main__"` tests.
+- Hydra config (see the sketch below).
+- dataloader, optimizer, criterion, device, and state constructed in main, but passed to an object that runs batches.
+- tqdm to track progress.
+- a debug config flag that enables lots of breakpoints.
+- python type hints.
+- a `launch.sh` script to dispatch training.
+- a Makefile to install and run stuff.
+- automatic linting with the `black` package.
+- collate functions!
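Since the tutorial list calls out the Hydra config, here is a minimal, self-contained sketch of the pattern (editor's illustration; `sketch.py` is hypothetical and not part of this commit):

```python
# sketch.py -- hypothetical minimal Hydra app, mirroring the pattern
# this template uses in src/pipeline.py.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="config", config_name="main")
def main(config: DictConfig) -> None:
    # Hydra loads config/main.yaml, applies any command-line overrides
    # (e.g. `python sketch.py lr=1e-3`), and injects the merged config.
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()
```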
src/batch.py
@@ -2,21 +2,24 @@ import torch
 from torch import nn
 from torch import optim
 from torch.utils.data import DataLoader
-from data import FashionDataset
+from data import MnistDataset
 from tqdm import tqdm
 from utils import Stage
+from omegaconf import DictConfig
 
 
 class Batch:
     def __init__(
         self,
         stage: Stage,
-        model: nn.Module, device,
+        model: nn.Module,
+        device,
         loader: DataLoader,
         optimizer: optim.Optimizer,
         criterion: nn.Module,
+        config: DictConfig = None,
     ):
         """todo"""
+        self.config = config
         self.stage = stage
         self.device = device
         self.model = model.to(device)
@@ -26,7 +29,11 @@ class Batch:
         self.loss = 0
 
     def run(self, desc):
+        # set the model to train mode
         if self.stage == Stage.TRAIN:
             self.model.train()
+        if self.config.debug:
+            breakpoint()
+        epoch = 0
         for epoch, (x, y) in enumerate(tqdm(self.loader, desc=desc)):
             self.optimizer.zero_grad()
@@ -34,6 +41,7 @@ class Batch:
         loss.backward()  # Send loss backwards to accumulate gradients
         self.optimizer.step()  # Perform a gradient update on the weights of the model
         self.loss += loss.item()
+        return self.loss
 
     def _run_batch(self, sample):
         true_x, true_y = sample
@@ -47,8 +55,8 @@ def main():
     model = nn.Conv2d(1, 64, 3)
     criterion = torch.nn.CrossEntropyLoss()
     optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path)
+    path = "mnist_train.csv"
+    dataset = MnistDataset(path)
     batch_size = 16
     num_workers = 1
     loader = torch.utils.data.DataLoader(
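As a reading aid (not part of the commit), here is a minimal sketch of the dependency-injection pattern `Batch` follows: everything is constructed in main and handed to the runner object. The stand-in model and `TensorDataset` are hypothetical:

```python
# Hypothetical usage sketch of the Batch runner pattern above.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Stand-in data: 64 one-channel 28x28 "images" with 10 classes.
x = torch.randn(64, 1, 28, 28)
y = torch.randint(0, 10, (64,))
loader = DataLoader(TensorDataset(x, y), batch_size=16)

model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

# The runner owns no construction logic; it only loops over batches:
# batch = Batch(stage=Stage.TRAIN, model=model, device=torch.device("cpu"),
#               loader=loader, optimizer=optimizer, criterion=criterion,
#               config=config)
# loss = batch.run("train")
```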
src/collate.py
@@ -0,0 +1,6 @@
+from einops import rearrange
+
+
+def channel_to_batch(batch):
+    """TODO"""
+    return batch
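The committed `channel_to_batch` is a pass-through stub. As a hedged illustration only (not the author's intended implementation), here is one way such a collate function could fold channels into the batch dimension; it would plug in via the `collate_fn=channel_to_batch` argument that `src/pipeline.py` leaves commented out:

```python
# Hypothetical completion of channel_to_batch, for illustration only.
import torch
from einops import rearrange


def channel_to_batch(batch):
    """Collate (image, label) pairs, merging channels into the batch dim."""
    images = torch.stack([x for x, _ in batch])          # (b, c, h, w)
    labels = torch.as_tensor([int(y) for _, y in batch]) # (b,)
    images = rearrange(images, "b c h w -> (b c) h w")   # (b*c, h, w)
    # repeat each label once per channel of its image
    labels = labels.repeat_interleave(images.shape[0] // labels.shape[0])
    return images, labels
```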
src/config/main.yaml
@@ -0,0 +1,6 @@
+app_dir: ${hydra:runtime.cwd}
+debug: true
+lr: 2e-4
+batch_size: 16
+num_workers: 0
+device: "cpu"
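Two notes on this config (editor's gloss): `${hydra:runtime.cwd}` is a Hydra resolver that records the directory the job was launched from, and every key here can be overridden on the command line. A small sketch of the override behavior, reproduced with plain OmegaConf (the override values are hypothetical):

```python
# Command-line overrides are merged on top of src/config/main.yaml, e.g.
#   python src/pipeline.py                  # defaults: lr=2e-4, debug=true
#   python src/pipeline.py debug=false      # turn off breakpoints
#   python src/pipeline.py lr=1e-3 device=cuda batch_size=64
from omegaconf import OmegaConf

# The same merge, reproduced programmatically:
base = OmegaConf.create({"lr": 2e-4, "debug": True})
override = OmegaConf.from_dotlist(["lr=1e-3", "debug=false"])
merged = OmegaConf.merge(base, override)
print(OmegaConf.to_yaml(merged))  # lr: 0.001, debug: false
```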
src/data.py
@@ -3,51 +3,69 @@ import numpy as np
 import einops
 import csv
 import torch
+from pathlib import Path
+from typing import Tuple
 
 
-class FashionDataset(Dataset):
-    def __init__(self, path: str):
+class MnistDataset(Dataset):
+    """
+    The MNIST database of handwritten digits.
+    Training set is 60k labeled examples, test is 10k examples.
+    The b/w images are normalized to 20x20, preserving aspect ratio.
+
+    It's the de facto standard image training set for learning about classification in DL.
+    """
+
+    def __init__(self, path: Path):
+        """
+        Give a path to a csv file in the format described here:
+        https://pjreddie.com/projects/mnist-in-csv/
+        """
         self.path = path
-        self.x, self.y = self.load()
+        self.features, self.labels = self.load()
 
     def __getitem__(self, idx):
-        return (self.x[idx], self.y[idx])
+        return (self.features[idx], self.labels[idx])
 
     def __len__(self):
-        return len(self.x)
+        return len(self.features)
 
-    def load(self):
+    def load(self) -> Tuple[torch.Tensor, torch.Tensor]:
         # opening the CSV file
         with open(self.path, mode="r") as file:
             images = list()
-            classes = list()
+            labels = list()
            # reading the CSV file
             csvFile = csv.reader(file)
-            # displaying the contents of the CSV file
-            header = next(csvFile)
+            # header = next(csvFile)
             limit = 1000
             for line in csvFile:
                 if limit < 1:
                     break
-                classes.append(int(line[:1][0]))
-                images.append([int(x) for x in line[1:]])
+                label = int(line[0])
+                labels.append(label)
+                image = [int(x) for x in line[1:]]
+                images.append(image)
                 limit -= 1
-        classes = torch.tensor(classes, dtype=torch.long)
+        labels = torch.tensor(labels, dtype=torch.long)
         images = torch.tensor(images, dtype=torch.float32)
         images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
         images = einops.repeat(
             images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
         )
-        return (images, classes)
+        return (images, labels)
 
 
 def main():
-    path = "fashion-mnist_train.csv"
-    dataset = FashionDataset(path=path)
+    path = "storage/mnist_train.csv"
+    dataset = MnistDataset(path=path)
     print(f"len: {len(dataset)}")
     print(f"first shape: {dataset[0][0].shape}")
-    mean = einops.reduce(dataset[:10], "n w h -> w h", "mean")
+    mean = einops.reduce(dataset[:10][0], "n c w h -> w h", "mean")
     print(f"mean shape: {mean.shape}")
     print(f"mean image: {mean}")
 
 
 if __name__ == "__main__":
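A quick shape walkthrough of the two einops calls in `load()` (editor's illustration on random stand-in data rather than the CSV): the flat rows are unflattened to 28x28, then each pixel is repeated into an 8x8 block, giving 224x224 inputs sized for the VGG11 used in `src/pipeline.py`:

```python
# Shape walkthrough of the einops calls in MnistDataset.load().
import torch
import einops

images = torch.rand(16, 784)  # 16 flat rows of 28*28 pixels

# unflatten: (n, 784) -> (n, 28, 28)
images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
assert images.shape == (16, 28, 28)

# add a channel dim and upsample 8x by repeating pixels:
# (n, 28, 28) -> (n, 1, 224, 224)
images = einops.repeat(images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8)
assert images.shape == (16, 1, 224, 224)
```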
src/pipeline.py
@@ -3,46 +3,54 @@ main class for building a DL pipeline.
 
 """
 
-import click
 from batch import Batch
 from model.linear import DNN
 from model.cnn import VGG16, VGG11
-from data import FashionDataset
+from data import MnistDataset
 from utils import Stage
 import torch
+from pathlib import Path
+from collate import channel_to_batch
+
+import hydra
+from omegaconf import DictConfig
 
 
-@click.group()
-def cli():
-    pass
-
-
-@cli.command()
-def train():
-    batch_size = 16
-    num_workers = 8
-
-    path = "fashion-mnist_train.csv"
-    trainset = FashionDataset(path=path)
+@hydra.main(config_path="config", config_name="main")
+def train(config: DictConfig):
+    if config.debug:
+        breakpoint()
+    lr = config.lr
+    batch_size = config.batch_size
+    num_workers = config.num_workers
+    device = config.device
+
+    path = Path(config.app_dir) / "storage/mnist_train.csv"
+    trainset = MnistDataset(path=path)
 
     trainloader = torch.utils.data.DataLoader(
-        trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers
+        trainset,
+        batch_size=batch_size,
+        shuffle=False,
+        num_workers=num_workers,
+        # collate_fn=channel_to_batch,
     )
     model = VGG11(in_channels=1, num_classes=10)
     criterion = torch.nn.CrossEntropyLoss()
-    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
     batch = Batch(
         stage=Stage.TRAIN,
         model=model,
-        device=torch.device("cpu"),
+        device=torch.device(device),
         loader=trainloader,
         criterion=criterion,
         optimizer=optimizer,
+        config=config,
    )
-    batch.run(
+    log = batch.run(
         "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne"
     )
 
 
 if __name__ == "__main__":
-    cli()
+    train()
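One subtlety worth a gloss (editor's note, with hypothetical paths): Hydra runs each job in a fresh working directory under `outputs/` (which is why this commit also adds `outputs/` to `.gitignore`), so a relative path like `storage/mnist_train.csv` would no longer resolve. The `app_dir: ${hydra:runtime.cwd}` entry captures the launch directory so the data path can be rebuilt absolutely:

```python
# Sketch of the app_dir mechanics; the paths here are hypothetical.
from pathlib import Path

# Inside a Hydra job, the cwd is something like
# <repo>/outputs/2021-01-01/12-00-00, not the repo root.
# ${hydra:runtime.cwd} resolved to the repo root at launch time:
app_dir = "/home/user/ml_pipeline"
data_path = Path(app_dir) / "storage/mnist_train.csv"
print(data_path)  # /home/user/ml_pipeline/storage/mnist_train.csv
```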
@@ -0,0 +1,10 @@
+from src.model.linear import DNN
+from src.data import GenericDataset
+import os
+
+
+def test_size_of_dataset():
+    features = 40
+    os.environ["INPUT_FEATURES"] = str(features)
+    dataset = GenericDataset()
+    assert len(dataset[0][0]) == features
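The new test module (its path is not shown in the truncated diff) sizes the dataset through an environment variable. `GenericDataset` itself does not appear anywhere in this commit, so the following is purely a hedged sketch of an implementation that would satisfy the test, not the author's code:

```python
# Hypothetical GenericDataset consistent with the test above.
import os
import torch
from torch.utils.data import Dataset


class GenericDataset(Dataset):
    """Random regression data sized by the INPUT_FEATURES env var."""

    def __init__(self, n: int = 100):
        features = int(os.environ.get("INPUT_FEATURES", 10))
        self.x = torch.randn(n, features)
        self.y = torch.randn(n, 1)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

    def __len__(self):
        return len(self.x)
```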