Added postgresql client package to Dockerfiles and psycopg2-binary to requirements.txt, removed unused matplotlib code/package from analysis/cluster.py and requirements.txt, replaced commands in docker start.sh file with new management commands.

This commit is contained in:
Dana Van Aken
2019-10-08 07:21:08 -04:00
parent e8f28ebef0
commit 0da1b724cc
9 changed files with 14 additions and 168 deletions

View File

@@ -3,11 +3,6 @@
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
'''
Created on Jul 4, 2016
@author: dva
'''
from abc import ABCMeta, abstractproperty
from collections import OrderedDict
@@ -15,7 +10,6 @@ import os
import json
import copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
@@ -274,46 +268,6 @@ class KMeansClusters(ModelBase):
return self
def save(self, savedir):
"""Saves the KMeans model results
Parameters
----------
savedir : string
Path to the directory to save the results in.
"""
if self.cluster_map_ is None:
raise Exception("No models have been fitted yet!")
cluster_map = OrderedDict()
inertias = []
for K, model in sorted(self.cluster_map_.items()):
cluster_map[K] = {
"cluster_inertia": model.cluster_inertia_,
"cluster_labels": model.cluster_labels_,
"cluster_centers": model.cluster_centers_,
}
inertias.append(model.cluster_inertia_)
# Save sum of squares plot (elbow curve)
fig = plt.figure()
plt.plot(list(cluster_map.keys()), inertias, '--o')
plt.xlabel("Number of clusters (K)")
plt.ylabel("Within sum of squares W_k")
plt.title("Within Sum of Squares vs. Number of Clusters")
fig.canvas.set_window_title(os.path.basename(savedir))
savepath = os.path.join(savedir, "kmeans_sum_of_squares.pdf")
plt.savefig(savepath, bbox_inches="tight")
plt.close()
# save cluster memberships
for K in range(self.min_cluster_, self.max_cluster_ + 1):
savepath = os.path.join(savedir,
"memberships_{}-clusters.json".format(K))
members = self.cluster_map_[K].get_memberships()
with open(savepath, "w") as f:
f.write(members)
class KSelection(ModelBase, metaclass=ABCMeta):
"""KSelection:
@@ -529,38 +483,6 @@ class GapStatistic(KSelection):
for i in range(K)
for x in X[cluster_labels == i]])
def save(self, savedir):
"""Saves the estimation results of the optimal # of clusters.
Parameters
----------
savedir : string
Path to the directory to save the results in.
"""
super(GapStatistic, self).save(savedir)
# Plot the calculated gap
gaps = self.log_wkbs_ - self.log_wks_
fig = plt.figure()
plt.plot(self.clusters_, gaps, '--o')
plt.title("Gap vs. Number of Clusters")
plt.xlabel("Number of clusters (K)")
plt.ylabel("gap_K")
fig.canvas.set_window_title(os.path.basename(savedir))
plt.savefig(os.path.join(savedir, self.name_ + ".pdf"), bbox_inches="tight")
plt.close()
# Plot the gap statistic
fig = plt.figure()
plt.bar(self.clusters_, self.khats_)
plt.title("Gap Statistic vs. Number of Clusters")
plt.xlabel("Number of clusters (K)")
plt.ylabel("gap(K)-(gap(K+1)-s(K+1))")
fig.canvas.set_window_title(os.path.basename(savedir))
plt.savefig(os.path.join(savedir, self.name_ + "_final.pdf"),
bbox_inches="tight")
plt.close()
class DetK(KSelection):
"""DetK:
@@ -649,27 +571,6 @@ class DetK(KSelection):
self.fs_ = fs
return self
def save(self, savedir):
"""Saves the estimation results of the optimal # of clusters.
Parameters
----------
savedir : string
Path to the directory to save the results in.
"""
super(DetK, self).save(savedir)
# Plot the evaluation function
fig = plt.figure()
plt.plot(self.clusters_, self.fs_, '--o')
plt.xlabel("Number of clusters (K)")
plt.ylabel("Evaluation function (F_k)")
plt.title("Evaluation Function vs. Number of Clusters")
fig.canvas.set_window_title(os.path.basename(savedir))
savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
plt.savefig(savepath, bbox_inches="tight")
plt.close()
class Silhouette(KSelection):
"""Det:
@@ -746,27 +647,6 @@ class Silhouette(KSelection):
self.scores_ = scores
return self
def save(self, savedir):
"""Saves the estimation results of the optimal # of clusters.
Parameters
----------
savedir : string
Path to the directory to save the results in.
"""
super(Silhouette, self).save(savedir)
# Plot the evaluation function
fig = plt.figure()
plt.plot(self.clusters_, self.scores_, '--o')
plt.xlabel("Number of clusters (K)")
plt.ylabel("Silhouette scores")
plt.title("Silhouette Scores vs. Number of Clusters")
fig.canvas.set_window_title(os.path.basename(savedir))
savepath = os.path.join(savedir, self.name_ + "_eval_function.pdf")
plt.savefig(savepath, bbox_inches="tight")
plt.close()
def create_kselection_model(model_name):
"""Constructs the KSelection model object with the given name

View File

@@ -6,20 +6,19 @@ django-debug-toolbar==1.5
django-db-logger>=0.1.7
django-request-logging==0.4.6
mock==2.0.0
Fabric3==1.13.1.post1
hurry.filesize==0.9
matplotlib==2.0.0
Fabric3>=1.13.1.post1
hurry.filesize>=0.9
numpy==1.13.1
requests==2.18.4
pycodestyle==2.3.1
astroid==1.5.1
psycopg2>=2.5.4
psycopg2-binary>=2.5.4
pylint==1.5.2
pyDOE==0.3.8
pyDOE>=0.3.8
mysqlclient==1.3.12
scikit-learn==0.19.1
scipy==1.0.0
tensorflow==1.10
threadpool==1.3.2
torch==1.2.0
torchvision==0.4.0
threadpool>=1.3.2
torch>=1.2.0
torchvision>=0.4.0