remove conda. add requirements.txt. add tests
This commit is contained in:
parent 355e83843f
commit 44250fc618
.gitignore
@@ -1,4 +1,163 @@
-storage/
+data/
+outputs
+
+# Byte-compiled / optimized / DLL files
 __pycache__/
-*.swp
-*.tmp
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
Makefile
@@ -1,15 +1,30 @@
-CONDA_ENV=ml_pipeline
+PYTHON=.venv/bin/python3
+
+.PHONY: help test
 
 all: run
 
-run:
-	python src/pipeline.py train
+init:
+	python3.9 -m virtualenv .venv
 
-data:
-	python src/data.py
+run: ## run the pipeline (train)
+	$(PYTHON) src/train.py \
+		debug=false
 
-batch:
-	python src/batch.py
+debug: ## run the pipeline (train) with debugging enabled
+	$(PYTHON) src/train.py \
+		debug=true
 
-install:
-	conda env updates -n ${CONDA_ENV} --file environment.yml
+data: ## download the mnist data
+	wget https://pjreddie.com/media/files/mnist_train.csv -O data/mnist_train.csv
+	wget https://pjreddie.com/media/files/mnist_test.csv -O data/mnist_test.csv
+
+test:
+	find . -iname "*.py" | entr -c pytest
+
+install:
+	$(PYTHON) -m pip install -r requirements.txt
+
+help: ## display this help message
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
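Both run and debug invoke src/train.py with a key=value argument, which is Hydra's CLI override syntax (hydra-core appears in requirements.txt below). train.py itself is not part of this diff, so the following is only a minimal sketch of the entrypoint shape it implies; the config path, config name, and config keys are assumptions:

```python
# hypothetical sketch of src/train.py's entrypoint -- the real file is not
# shown in this commit; config_path/config_name and cfg keys are assumptions
import hydra
from omegaconf import DictConfig


@hydra.main(config_path="../conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    # Hydra parses CLI overrides like `debug=true` into fields on cfg
    if cfg.debug:
        print("debug run: e.g. fewer examples, extra logging")
    # ... training loop would go here ...


if __name__ == "__main__":
    main()
```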
requirements.txt
@@ -0,0 +1,12 @@
+black
+click
+einops
+hydra-core
+matplotlib
+numpy
+wandb
+pytest
+python-dotenv
+torch
+requests
+tqdm
src/data/dataset.py
@@ -0,0 +1,66 @@
+from torch.utils.data import Dataset
+import numpy as np
+import einops
+import csv
+import torch
+from pathlib import Path
+from typing import Tuple
+import os
+
+
+class MnistDataset(Dataset):
+    """
+    The MNIST database of handwritten digits.
+    The training set is 60k labeled examples; the test set is 10k examples.
+    The b/w digits are size-normalized into a 20x20 box, preserving aspect
+    ratio, and centered in a 28x28 image.
+
+    It's the de facto standard image dataset for learning about classification in DL.
+    """
+
+    def __init__(self, path: Path):
+        """
+        `path` points at one of the csv files from:
+        https://pjreddie.com/projects/mnist-in-csv/
+        """
+        self.path = path
+        self.features, self.labels = self._load()
+
+    def __getitem__(self, idx):
+        return (self.features[idx], self.labels[idx])
+
+    def __len__(self):
+        return len(self.features)
+
+    def _load(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        # each csv row is: label, then 784 pixel values (one 28x28 image)
+        with open(self.path, mode="r") as file:
+            images, labels = [], []
+            csvFile = csv.reader(file)
+            examples = int(os.getenv("TRAINING_EXAMPLES", 1000))
+            for line, content in enumerate(csvFile):
+                if line == examples:
+                    break
+                labels.append(int(content[0]))
+                image = [int(x) for x in content[1:]]
+                images.append(image)
+            labels = torch.tensor(labels, dtype=torch.long)
+            images = torch.tensor(images, dtype=torch.float32)
+            images = einops.rearrange(images, "n (w h) -> n w h", w=28, h=28)
+            # add a channel dim and upsample 28x28 -> 224x224 by repeating
+            # each pixel 8x8, matching the VGG-style input size used below
+            images = einops.repeat(
+                images, "n w h -> n c (w r_w) (h r_h)", c=1, r_w=8, r_h=8
+            )
+            return (images, labels)
+
+
+def main():
+    path = Path("data/mnist_train.csv")  # `make data` downloads it here
+    dataset = MnistDataset(path=path)
+    print(f"len: {len(dataset)}")
+    print(f"first shape: {dataset[0][0].shape}")
+    # features are (n, c, w, h) after the repeat above
+    mean = einops.reduce(dataset[:10][0], "n c w h -> w h", "mean")
+    print(f"mean shape: {mean.shape}")
+    print(f"mean image: {mean}")
+
+
+if __name__ == "__main__":
+    main()
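Because MnistDataset implements __getitem__ and __len__ against torch.utils.data.Dataset, it drops straight into a DataLoader. A short usage sketch; the csv path assumes you ran `make data` first, and the shapes follow from the einops.repeat above:

```python
# usage sketch for MnistDataset with a standard DataLoader
from pathlib import Path

from torch.utils.data import DataLoader

from src.data.dataset import MnistDataset

dataset = MnistDataset(path=Path("data/mnist_train.csv"))
loader = DataLoader(dataset, batch_size=32, shuffle=True)

images, labels = next(iter(loader))
print(images.shape)  # torch.Size([32, 1, 224, 224])
print(labels.shape)  # torch.Size([32])
```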
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+from sys import stdout
+import csv
+
+# 'pip install pyspark' for these
+from pyspark import SparkFiles
+from pyspark.sql import SparkSession
+
+# make a spark "session". this creates a local hadoop cluster by default (!)
+spark = SparkSession.builder.getOrCreate()
+# put the input file in the cluster's filesystem:
+spark.sparkContext.addFile("https://csvbase.com/meripaterson/stock-exchanges.csv")
+# the following is much like for pandas
+df = (
+    spark.read.csv(f"file://{SparkFiles.get('stock-exchanges.csv')}", header=True)
+    .select("MIC")
+    .na.drop()
+    .sort("MIC")
+)
+# pyspark has no easy way to write csv to stdout - use python's csv lib
+csv.writer(stdout).writerows(df.collect())
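For comparison with the "much like for pandas" remark, roughly the same pipeline written with pandas; note that pandas is not in requirements.txt, so this is purely illustrative:

```python
# illustrative only -- pandas is not a dependency of this repo
import sys

import pandas as pd

df = pd.read_csv("https://csvbase.com/meripaterson/stock-exchanges.csv")
df["MIC"].dropna().sort_values().to_csv(sys.stdout, index=False, header=True)
```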
@@ -37,10 +37,10 @@ class VGG11(nn.Module):
         self.linear_layers = nn.Sequential(
             nn.Linear(in_features=512 * 7 * 7, out_features=4096),
             nn.ReLU(),
-            nn.Dropout2d(0.5),
+            nn.Dropout(0.5),
             nn.Linear(in_features=4096, out_features=4096),
             nn.ReLU(),
-            nn.Dropout2d(0.5),
+            nn.Dropout(0.5),
             nn.Linear(in_features=4096, out_features=self.num_classes),
         )
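The swap from nn.Dropout2d to nn.Dropout matters here: Dropout2d zeroes entire channels of a (N, C, H, W) feature map, which is intended for conv blocks, not for the flattened (N, 4096) activations in this classifier head, where element-wise Dropout is the correct op. A quick sketch of the difference:

```python
# element-wise vs channel-wise dropout, on shapes like the ones above
import torch
import torch.nn as nn

flat = torch.ones(2, 8)        # flattened features, as in the classifier head
print(nn.Dropout(0.5)(flat))   # individual elements zeroed (survivors scaled by 2)

fmap = torch.ones(2, 4, 3, 3)  # conv feature map (N, C, H, W)
print(nn.Dropout2d(0.5)(fmap)) # whole channels zeroed at once
```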
.env.test
@@ -0,0 +1,2 @@
+TRAIN_PATH=${HOME}/Dev/ml/data/mnist_train.csv
+INPUT_FEATURES=40
conftest.py
@@ -0,0 +1,20 @@
+# conftest.py
+import pytest
+import os
+from dotenv import load_dotenv
+from pathlib import Path
+
+
+@pytest.fixture(autouse=True)
+def load_env():
+    # Set up your environment variables here
+    env = Path(__file__).parent / ".env.test"
+    if not load_dotenv(env):
+        raise RuntimeError(".env.test not loaded")
+    # os.environ['MY_ENV_VAR'] = 'some_value'
+    # You can add more setup code here if needed
+
+    yield
+
+    # Optional: Cleanup code after test (if needed)
+    # e.g., unset environment variables if they should not persist after test
@@ -0,0 +1,17 @@
+from src.model.linear import DNN
+from src.data.dataset import MnistDataset
+import os
+
+
+def test_size_of_dataset():
+    examples = 500
+    os.environ["TRAINING_EXAMPLES"] = str(examples)
+    channels = 1
+    width, height = 224, 224
+    dataset = MnistDataset(os.getenv("TRAIN_PATH"))
+    # label = dataset[0][1].item()
+    image = dataset[0][0].shape
+    assert channels == image[0]
+    assert width == image[1]
+    assert height == image[2]
+    assert len(dataset) == examples
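End to end, the test wiring works like this: the autouse fixture in conftest.py loads .env.test before each test, so TRAIN_PATH points at a local MNIST csv; the test then caps loading at 500 rows via TRAINING_EXAMPLES and asserts that _load's einops upsampling produced (1, 224, 224) images. `make test` keeps this loop running by re-invoking pytest through entr whenever a .py file changes.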