add hydra config.
remove click. add launch script. add test dir. switch from fashion mnist to generic.
This commit is contained in:
		
							parent
							
								
									404e39206b
								
							
						
					
					
						commit
						1f13224c4f
					
				|  | @ -1,2 +1,4 @@ | |||
| storage/ | ||||
| __pycache__/ | ||||
| outputs/ | ||||
| .env | ||||
|  |  | |||
							
								
								
									
										2
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										2
									
								
								Makefile
								
								
								
								
							|  | @ -3,7 +3,7 @@ CONDA_ENV=ml_pipeline | |||
| all: run | ||||
| 
 | ||||
| run: | ||||
| 	python src/pipeline.py train | ||||
| 	./launch.sh | ||||
| 
 | ||||
| data: | ||||
| 	python src/data.py | ||||
|  |  | |||
							
								
								
									
										26
									
								
								README.md
								
								
								
								
							
							
						
						
									
										26
									
								
								README.md
								
								
								
								
							|  | @ -7,9 +7,9 @@ Instead of remembering where to put everything and making a different choice for | |||
| Think of it like a mini-pytorch lightening, with all the fory internals exposed for extension and modification. | ||||
| 
 | ||||
| 
 | ||||
| ## Usage | ||||
| # Usage | ||||
| 
 | ||||
| ### Install: | ||||
| ## Install: | ||||
| 
 | ||||
| Install the conda requirements: | ||||
| 
 | ||||
|  | @ -23,7 +23,7 @@ Which is a proxy for calling: | |||
| conda env updates -n ml_pipeline --file environment.yml | ||||
| ``` | ||||
| 
 | ||||
| ### Run: | ||||
| ## Run: | ||||
| 
 | ||||
| Run the code on MNIST with the following command: | ||||
| 
 | ||||
|  | @ -31,3 +31,23 @@ Run the code on MNIST with the following command: | |||
| make run | ||||
| ``` | ||||
| 
 | ||||
| 
 | ||||
| # Tutorial | ||||
| 
 | ||||
| The motivation for building a template for deep learning pipelines is this: deep learning is hard enough without every code baase being a little different. | ||||
| 
 | ||||
| Especially in a research lab, standardizing on a few components makes switching between projects easier. | ||||
| 
 | ||||
| In this template, you'll see the following: | ||||
| 
 | ||||
| - `src/model`, `src/config`, `storage`, `test` dirs. | ||||
| - `if __name__ == "__main__"` tests. | ||||
| - Hydra config. | ||||
| - dataloader, optimizer, criterion, device, state are constructed in main, but passed to an object that runs batches. | ||||
| - tqdm to track progress. | ||||
| - debug config flag enables lots breakpoints. | ||||
| - python type hints. | ||||
| - a `launch.sh` script to dispatch training. | ||||
| - a Makefile to install and run stuff. | ||||
| - automatic linting with the `black` package. | ||||
| - collate functions! | ||||
|  |  | |||
							
								
								
									
										18
									
								
								src/batch.py
								
								
								
								
							
							
						
						
									
										18
									
								
								src/batch.py
								
								
								
								
							|  | @ -2,21 +2,24 @@ import torch | |||
| from torch import nn | ||||
| from torch import optim | ||||
| from torch.utils.data import DataLoader | ||||
| from data import FashionDataset | ||||
| from data import MnistDataset | ||||
| from tqdm import tqdm | ||||
| from utils import Stage | ||||
| from omegaconf import DictConfig | ||||
| 
 | ||||
| 
 | ||||
| class Batch: | ||||
|     def __init__( | ||||
|         self, | ||||
|         stage: Stage, | ||||
|         model: nn.Module, device, | ||||
|         model: nn.Module, | ||||
|         device, | ||||
|         loader: DataLoader, | ||||
|         optimizer: optim.Optimizer, | ||||
|         criterion: nn.Module, | ||||
|         config: DictConfig = None, | ||||
|     ): | ||||
|         """todo""" | ||||
|         self.config = config | ||||
|         self.stage = stage | ||||
|         self.device = device | ||||
|         self.model = model.to(device) | ||||
|  | @ -26,7 +29,11 @@ class Batch: | |||
|         self.loss = 0 | ||||
| 
 | ||||
|     def run(self, desc): | ||||
|         # set the model to train model | ||||
|         if self.stage == Stage.TRAIN: | ||||
|             self.model.train() | ||||
|         if self.config.debug: | ||||
|             breakpoint() | ||||
|         epoch = 0 | ||||
|         for epoch, (x, y) in enumerate(tqdm(self.loader, desc=desc)): | ||||
|             self.optimizer.zero_grad() | ||||
|  | @ -34,6 +41,7 @@ class Batch: | |||
|             loss.backward()  # Send loss backwards to accumulate gradients | ||||
|             self.optimizer.step()  # Perform a gradient update on the weights of the mode | ||||
|             self.loss += loss.item() | ||||
|         return self.loss | ||||
| 
 | ||||
|     def _run_batch(self, sample): | ||||
|         true_x, true_y = sample | ||||
|  | @ -47,8 +55,8 @@ def main(): | |||
|     model = nn.Conv2d(1, 64, 3) | ||||
|     criterion = torch.nn.CrossEntropyLoss() | ||||
|     optimizer = torch.optim.Adam(model.parameters(), lr=2e-4) | ||||
|     path = "fashion-mnist_train.csv" | ||||
|     dataset = FashionDataset(path) | ||||
|     path = "mnist_train.csv" | ||||
|     dataset = MnistDataset(path) | ||||
|     batch_size = 16 | ||||
|     num_workers = 1 | ||||
|     loader = torch.utils.data.DataLoader( | ||||
|  |  | |||
|  | @ -0,0 +1,6 @@ | |||
| from einops import rearrange | ||||
| 
 | ||||
| 
 | ||||
| def channel_to_batch(batch): | ||||
|     """TODO""" | ||||
|     return batch | ||||
|  | @ -0,0 +1,6 @@ | |||
| app_dir: ${hydra:runtime.cwd} | ||||
| debug: true | ||||
| lr: 2e-4 | ||||
| batch_size: 16 | ||||
| num_workers: 0 | ||||
| device: "cpu" | ||||
							
								
								
									
										48
									
								
								src/data.py
								
								
								
								
							
							
						
						
									
										48
									
								
								src/data.py
								
								
								
								
							|  | @ -3,51 +3,69 @@ import numpy as np | |||
| import einops | ||||
| import csv | ||||
| import torch | ||||
| from pathlib import Path | ||||
| from typing import Tuple | ||||
| 
 | ||||
| 
 | ||||
| class FashionDataset(Dataset): | ||||
|     def __init__(self, path: str): | ||||
| class MnistDataset(Dataset): | ||||
|     """ | ||||
|     The MNIST database of handwritten digits. | ||||
|     Training set is 60k labeled examples, test is 10k examples. | ||||
|     The b/w images normalized to 20x20, preserving aspect ratio. | ||||
| 
 | ||||
|     It's the defacto standard image training set to learn about classification in DL | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, path: Path): | ||||
|         """ | ||||
|         give a path to a dir that contains the following csv files: | ||||
|         https://pjreddie.com/projects/mnist-in-csv/ | ||||
|         """ | ||||
|         self.path = path | ||||
|         self.x, self.y = self.load() | ||||
|         self.features, self.labels = self.load() | ||||
| 
 | ||||
|     def __getitem__(self, idx): | ||||
|         return (self.x[idx], self.y[idx]) | ||||
|         return (self.features[idx], self.labels[idx]) | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return len(self.x) | ||||
|         return len(self.features) | ||||
| 
 | ||||
|     def load(self): | ||||
|     def load(self) -> Tuple[torch.Tensor, torch.Tensor]: | ||||
|         # opening the CSV file | ||||
|         with open(self.path, mode="r") as file: | ||||
|             images = list() | ||||
|             classes = list() | ||||
|             labels = list() | ||||
|             # reading the CSV file | ||||
|             csvFile = csv.reader(file) | ||||
|             # displaying the contents of the CSV file | ||||
|             header = next(csvFile) | ||||
|             # header = next(csvFile) | ||||
|             limit = 1000 | ||||
|             for line in csvFile: | ||||
|                 if limit < 1: | ||||
|                     break | ||||
|                 classes.append(int(line[:1][0])) | ||||
|                 images.append([int(x) for x in line[1:]]) | ||||
|                 label = int(line[0]) | ||||
|                 labels.append(label) | ||||
|                 image = [int(x) for x in line[1:]] | ||||
|                 images.append(image) | ||||
|                 limit -= 1 | ||||
|             classes = torch.tensor(classes, dtype=torch.long) | ||||
|             labels = torch.tensor(labels, dtype=torch.long) | ||||
|             images = torch.tensor(images, dtype=torch.float32) | ||||
|             images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28) | ||||
|             images = einops.repeat( | ||||
|                 images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8 | ||||
|             ) | ||||
|             return (images, classes) | ||||
|             return (images, labels) | ||||
| 
 | ||||
| 
 | ||||
| def main(): | ||||
|     path = "fashion-mnist_train.csv" | ||||
|     dataset = FashionDataset(path=path) | ||||
| 
 | ||||
|     path = "storage/mnist_train.csv" | ||||
|     dataset = MnistDataset(path=path) | ||||
|     print(f"len: {len(dataset)}") | ||||
|     print(f"first shape: {dataset[0][0].shape}") | ||||
|     mean = einops.reduce(dataset[:10], "n w h -> w h", "mean") | ||||
|     mean = einops.reduce(dataset[:10][0], "n w h -> w h", "mean") | ||||
|     print(f"mean shape: {mean.shape}") | ||||
|     print(f"mean image: {mean}") | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  |  | |||
|  | @ -3,46 +3,54 @@ main class for building a DL pipeline. | |||
| 
 | ||||
| """ | ||||
| 
 | ||||
| import click | ||||
| from batch import Batch | ||||
| from model.linear import DNN | ||||
| from model.cnn import VGG16, VGG11 | ||||
| from data import FashionDataset | ||||
| from data import MnistDataset | ||||
| from utils import Stage | ||||
| import torch | ||||
| from pathlib import Path | ||||
| from collate import channel_to_batch | ||||
| 
 | ||||
| import hydra | ||||
| from omegaconf import DictConfig | ||||
| 
 | ||||
| 
 | ||||
| @click.group() | ||||
| def cli(): | ||||
|     pass | ||||
| @hydra.main(config_path="config", config_name="main") | ||||
| def train(config: DictConfig): | ||||
|     if config.debug: | ||||
|         breakpoint() | ||||
|     lr = config.lr | ||||
|     batch_size = config.batch_size | ||||
|     num_workers = config.num_workers | ||||
|     device = config.device | ||||
| 
 | ||||
| 
 | ||||
| @cli.command() | ||||
| def train(): | ||||
|     batch_size = 16 | ||||
|     num_workers = 8 | ||||
| 
 | ||||
|     path = "fashion-mnist_train.csv" | ||||
|     trainset = FashionDataset(path=path) | ||||
|     path = Path(config.app_dir) / "storage/mnist_train.csv" | ||||
|     trainset = MnistDataset(path=path) | ||||
| 
 | ||||
|     trainloader = torch.utils.data.DataLoader( | ||||
|         trainset, batch_size=batch_size, shuffle=False, num_workers=num_workers | ||||
|         trainset, | ||||
|         batch_size=batch_size, | ||||
|         shuffle=False, | ||||
|         num_workers=num_workers, | ||||
|         # collate_fn=channel_to_batch, | ||||
|     ) | ||||
|     model = VGG11(in_channels=1, num_classes=10) | ||||
|     criterion = torch.nn.CrossEntropyLoss() | ||||
|     optimizer = torch.optim.Adam(model.parameters(), lr=2e-4) | ||||
|     optimizer = torch.optim.Adam(model.parameters(), lr=lr) | ||||
|     batch = Batch( | ||||
|         stage=Stage.TRAIN, | ||||
|         model=model, | ||||
|         device=torch.device("cpu"), | ||||
|         device=torch.device(device), | ||||
|         loader=trainloader, | ||||
|         criterion=criterion, | ||||
|         optimizer=optimizer, | ||||
|         config=config, | ||||
|     ) | ||||
|     batch.run( | ||||
|     log = batch.run( | ||||
|         "Run run run run. Run run run away. Oh Oh oH OHHHHHHH yayayayayayayayaya! - David Byrne" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     cli() | ||||
|     train() | ||||
|  |  | |||
|  | @ -0,0 +1,10 @@ | |||
| from src.model.linear import DNN | ||||
| from src.data import GenericDataset | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
| def test_size_of_dataset(): | ||||
|     features = 40 | ||||
|     os.environ["INPUT_FEATURES"] = str(features) | ||||
|     dataset = GenericDataset() | ||||
|     assert len(dataset[0][0]) == features | ||||
		Loading…
	
		Reference in New Issue