diff --git a/README.md b/README.md
index ff09510..2945da1 100644
--- a/README.md
+++ b/README.md
@@ -17,11 +17,14 @@ Pipenv:
 pipenv install -r requirements.txt
 pipenv shell
 ```
+Make sure h5py 2.10.0 is installed in your environment.
 
 Virtualenv is similar, but there's really no reason to use virtualenv instead of Pipenv anymore.
 
 ## Retraining the model
 
+### The model in this repo was retrained in Oct. 2022
+
 There is a trained model checked into the `models` directory. If you'd like to train your own, you'll first need to
 download the training data from S3:
 
 ```
@@ -47,19 +50,67 @@ potentially run it for, say, 5 epochs and still get good accuracy with half the
 `python degas/runner train-model --epochs 5 data/processed `
 
-## Making predictions
+## Making predictions (with TensorFlow and TensorFlow Serving pinned to 1.11.0)
 
 Since this project uses Tensorflow as the underlying deep learning library, the recommended way to use this for
 inference is to use [Tensorflow Serving](https://www.tensorflow.org/serving/).
 
 You should be able to serve it using:
+
 ```
-docker run -p 8501:8501 \
-  --mount type=bind,source=models/degas,target=/models/degas\
-  -e MODEL_NAME=degas -t tensorflow/serving
+docker run -p 8501:8501 \
+  --mount type=bind,source=/Users/yourUserName/PycharmProjects/degas/models/degas,target=/models/degas \
+  -e MODEL_NAME=degas -t tensorflow/serving:1.11.0
 ```
 
 See [Tensorflow Serving docs](https://www.tensorflow.org/serving/docker) for more information about available options.
 
+Show model info:
+http://localhost:8501/v1/models/degas
+
+Show model metadata:
+http://localhost:8501/v1/models/degas/metadata
+
+Make a prediction (POST):
+http://localhost:8501/v1/models/degas:predict
+
+The JSON body to POST is:
+
+```
+{
+  "instances": [[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,0,0,37,37,37,12,21,29,29,21,26,19,12
+,17,29,27]
+,[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,15,2,38,4,3,36,8,9,19,33,0,1,12
+,17,29,27]
+,[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+,0,0,0,0,0,0,0,0,0,0,0,0,37,37,37,12,34,37,23,34,34,19,32,12
+,17,29,27]]
+}
+```
+
+These three instances represent the domains below:
+"www.google.com", "a2x43v89es01.com", "www.twitter.com"
+
+The response will be:
+
+```
+{
+    "predictions": [
+        [
+            4.54876e-11
+        ],
+        [
+            0.723077
+        ],
+        [
+            2.9277e-18
+        ]
+    ]
+}
+```
+
+So the prediction results for "www.google.com", "a2x43v89es01.com", and "www.twitter.com" are: false, true, false.
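+
+For convenience, the same prediction request can be made from Python. The sketch below is illustrative
+only: it assumes the `requests` package is installed and the serving container above is running, and it
+uses a 0.5 decision threshold, which is an assumption rather than something defined by this repo.
+
+```
+import requests
+
+# Illustrative helper around the REST endpoint shown above (not part of this repo).
+def predict_encoded(instances):
+    """POST already-encoded domain vectors; return True where a DGA is predicted."""
+    resp = requests.post(
+        "http://localhost:8501/v1/models/degas:predict",
+        json={"instances": instances},
+    )
+    resp.raise_for_status()
+    # Scores above 0.5 are treated as DGAs here; adjust the threshold as needed.
+    return [score[0] > 0.5 for score in resp.json()["predictions"]]
+```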
+
 # About Degas
diff --git a/degas/model/predict.py b/degas/model/predict.py
index 1f65874..9afb225 100644
--- a/degas/model/predict.py
+++ b/degas/model/predict.py
@@ -34,12 +34,16 @@ def load_model(version=1) -> Model:
     return model
 
 
-def predict(model: Model, domains: np.ndarray) -> np.ndarray:
+def predict(model: Model, domains: np.ndarray, batch_size: int = 32, steps=None, max_queue_size: int = 10, workers: int = 1) -> np.ndarray:
     """
     Given a list of domains as input, returns a list of booleans, where True means it is predicted to be a DGA, and
     false means it is predicted to be benign
     """
-    predictions = model.predict_on_batch(prep_data(domains))
+    # predictions = model.predict(prep_data(domains), batch_size=batch_size, steps=steps, max_queue_size=max_queue_size, workers=workers)
+
+    # the workers parameter is not present in tensorflow 1.11.0
+    predictions = model.predict(prep_data(domains), batch_size=batch_size, steps=steps, max_queue_size=max_queue_size)
+    # predictions = model.predict_on_batch(prep_data(domains))
     return predictions
 
 
diff --git a/degas/model/train.py b/degas/model/train.py
index e18d9e6..8e18be8 100644
--- a/degas/model/train.py
+++ b/degas/model/train.py
@@ -153,6 +153,7 @@ def run_kfold(data: pd.DataFrame, num_epochs=100, kfold_splits=2, batch_size=256
 
 def main(input_filepath: str, epochs: int = 100, kfold_splits: int = 3) -> None:
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
     logging.info("load up some data")
     input_path = Path(input_filepath)
     # if the input was a directory, add our default filename.
diff --git a/environment.yml b/environment.yml
index 8ac0137..7b2103e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,6 +11,8 @@ dependencies:
   - seaborn
   - matplotlib
   - Click
-  - python-dotenv
+  - python-dotenv=0.20.0
   - typing
-  - tensorflow=1.12.0
+  - tensorflow=1.11.0
+  - h5py=2.10.0
+
diff --git a/models/degas/1/nyu_model.h5 b/models/degas/1/nyu_model.h5
index 070186e..e70694a 100644
Binary files a/models/degas/1/nyu_model.h5 and b/models/degas/1/nyu_model.h5 differ
diff --git a/models/degas/1/saved_model.pb b/models/degas/1/saved_model.pb
index 9448a96..ba2c085 100644
Binary files a/models/degas/1/saved_model.pb and b/models/degas/1/saved_model.pb differ
diff --git a/models/degas/1/variables/variables.data-00000-of-00001 b/models/degas/1/variables/variables.data-00000-of-00001
index 0ac063a..27919b7 100644
Binary files a/models/degas/1/variables/variables.data-00000-of-00001 and b/models/degas/1/variables/variables.data-00000-of-00001 differ
diff --git a/models/degas/1/variables/variables.index b/models/degas/1/variables/variables.index
index c78c19e..19db87b 100644
Binary files a/models/degas/1/variables/variables.index and b/models/degas/1/variables/variables.index differ
diff --git a/requirements.txt b/requirements.txt
index 08f04c8..3d165f5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-python>=3.6.7
-tensorflow=1.12.0
+python >= 3.6.7
+tensorflow == 1.11.0
 pandas>=0.22
 scikit-learn>=0.20
 numpy
-python-dotenv
+python-dotenv==0.20.0
 typing
 Click
 matplotlib
diff --git a/tests/model/test_train.py b/tests/model/test_train.py
index e69ed5d..15fab22 100644
--- a/tests/model/test_train.py
+++ b/tests/model/test_train.py
@@ -1,3 +1,5 @@
+import os
+
 from ..context import degas
 from tensorflow.python.keras.models import Model
 
@@ -6,6 +8,7 @@ def test_build_model():
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
     model: Model = degas.model.train.build_model()
     # this validates that we can build and compile it w/o error, which catches the most
     # common issues in model creation
     print("Model: {}".format(model))
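
For context on the `predict()` change in `degas/model/predict.py` above, here is a usage sketch of the
new signature. It is illustrative only: it assumes the package is importable as `degas`, that
`load_model`/`predict` behave as shown in the diff, and that `predict` returns one raw score per domain.

```
import numpy as np

from degas.model.predict import load_model, predict

# Load version 1 of the bundled model and score a few domains.
# batch_size is the keyword added in this change; on TensorFlow 1.11.0 the
# workers argument is accepted but unused, as the in-code comment notes.
model = load_model(version=1)
domains = np.array(["www.google.com", "a2x43v89es01.com", "www.twitter.com"])
scores = predict(model, domains, batch_size=32)
print(scores)  # higher scores mean "more likely a DGA"
```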