Created
May 11, 2019 21:05
-
-
Save apetenchea/c729f9a8a4606f8b4a8ecfce92a4b3a6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", | |
| "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import json\n", | |
| "import pandas as pd\n", | |
| "import tensorflow.keras as keras\n", | |
| "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", | |
| "from tensorflow.keras.models import Model, load_model, Sequential\n", | |
| "from tensorflow.keras.layers import Dense, BatchNormalization, Input, Dropout, Activation\n", | |
| "from tensorflow.keras.models import load_model\n", | |
| "import tensorflow as tf # 1.11.0\n", | |
| "from sklearn.preprocessing import StandardScaler\n", | |
| "from sklearn.metrics import confusion_matrix\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import seaborn as sns\n", | |
| "import matplotlib\n", | |
| "matplotlib.rcParams['figure.figsize'] = (16, 9)\n", | |
| "sns.set(font_scale=1.5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "config = tf.ConfigProto()\n", | |
| "config.gpu_options.allow_growth = True\n", | |
| "sess = tf.Session(config=config)\n", | |
| "tf.keras.backend.set_session(sess)\n", | |
| "\n", | |
| "%matplotlib inline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "banned = [\n", | |
| "'WRITE_CALL_LOG',\n", | |
| "'WRITE_EXTERNAL_STORAGE',\n", | |
| "'READ_CALL_LOG',\n", | |
| "'READ_EXTERNAL_STORAGE',\n", | |
| "'READ_PHONE_STATE',\n", | |
| "'WRITE_SETTINGS',\n", | |
| "'GET_ACCOUNTS',\n", | |
| "'SYSTEM_ALERT_WINDOW',\n", | |
| "'READ_SETTINGS',\n", | |
| "'PERMISSIONS',\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Load data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dataset = list()\n", | |
| "with open('dataset281118.dat') as f:\n", | |
| " for line in f:\n", | |
| " dataset.append(json.loads(line.strip()))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "features = json.load(open('features.json'))['config']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "d = list()\n", | |
| "for i in dataset:\n", | |
| " s = dict()\n", | |
| " for f in features:\n", | |
| " value = i['features'][f['name']]\n", | |
| " if f['clipUpper'] is not None:\n", | |
| " value = min(value, f['clipUpper'])\n", | |
| " if f['clipLower'] is not None:\n", | |
| " value = max(value, f['clipLower'])\n", | |
| " s[f['name']] = value\n", | |
| " s['verdict'] = 0 if i['verdict'] == 'clean' else 1\n", | |
| " d.append(s)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.DataFrame(d, columns=d[0].keys())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "X = df.drop('verdict', axis='columns')\n", | |
| "Y = df.verdict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "X_train, Y_train = X[:800000].values.astype(np.float32), Y[:800000].values\n", | |
| "X_valid, Y_valid = X[800000:900000].values.astype(np.float32), Y[800000:900000].values\n", | |
| "X_test, Y_test = X[900000:].values.astype(np.float32), Y[900000:].values" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "X_train_scaled = X_train.copy()\n", | |
| "X_valid_scaled = X_valid.copy()\n", | |
| "X_test_scaled = X_test.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for idx in range(len(features)):\n", | |
| " if features[idx]['type'] in ('number', 'count'):\n", | |
| " scaler = StandardScaler()\n", | |
| " scaler.fit(X_train[:, idx:idx+1])\n", | |
| " scaler.mean_ = np.float32(scaler.mean_)\n", | |
| " scaler.scale_ = np.float32(scaler.scale_)\n", | |
| " features[idx]['mean'] = float(scaler.mean_[0])\n", | |
| " features[idx]['scale'] = float(scaler.scale_[0])\n", | |
| " X_train_scaled[:, idx:idx+1] = scaler.transform(X_train_scaled[:, idx:idx+1])\n", | |
| " X_valid_scaled[:, idx:idx+1] = scaler.transform(X_valid_scaled[:, idx:idx+1])\n", | |
| " X_test_scaled[:, idx:idx+1] = scaler.transform(X_test_scaled[:, idx:idx+1])\n", | |
| " else:\n", | |
| " features[idx]['mean'] = None\n", | |
| " features[idx]['scale'] = None" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "conf = dict(config=features)\n", | |
| "with open('features.json', 'w') as f:\n", | |
| " json.dump(conf, f, separators=(',', ':'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Train" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def train():\n", | |
| " nn = Sequential([\n", | |
| " Dense(256, input_dim=len(features)),\n", | |
| " Activation('relu'),\n", | |
| " BatchNormalization(),\n", | |
| " Dropout(0.2),\n", | |
| " Dense(256),\n", | |
| " Activation('relu'),\n", | |
| " BatchNormalization(),\n", | |
| " Dropout(0.5),\n", | |
| " Dense(128),\n", | |
| " Activation('relu'),\n", | |
| " BatchNormalization(),\n", | |
| " Dropout(0.5),\n", | |
| " Dense(64),\n", | |
| " Activation('relu'),\n", | |
| " BatchNormalization(),\n", | |
| " Dense(64),\n", | |
| " Activation('relu'),\n", | |
| " BatchNormalization(),\n", | |
| " Dense(1),\n", | |
| " Activation('sigmoid')\n", | |
| " ])\n", | |
| " nn.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'], weighted_metrics=['accuracy'])\n", | |
| " callbacks = list()\n", | |
| " callbacks.append(EarlyStopping(monitor='val_loss', min_delta=0, patience=3))\n", | |
| " callbacks.append(ModelCheckpoint('earl_best.h5', monitor='val_acc', save_weights_only=False, save_best_only=True))\n", | |
| "\n", | |
| " nn.fit(X_train_scaled, Y_train, epochs=100, batch_size=32, validation_data=(X_valid_scaled, Y_valid), callbacks=callbacks)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "train()\n", | |
| "nn = load_model('earl_best.h5')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "nn.evaluate(X_valid_scaled, Y_valid)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "nn.evaluate(X_train_scaled, Y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "nn.evaluate(X_test_scaled, Y_test)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "label = ['clean', 'trojan']\n", | |
| "matrix = confusion_matrix(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n", | |
| "percent = np.zeros((2,2))\n", | |
| "for i in range(2):\n", | |
| " s = sum(matrix[i])\n", | |
| " for j in range(2):\n", | |
| " percent[i][j] = matrix[i][j].item() * 100.0 / s\n", | |
| "cm = pd.DataFrame(percent, columns=label, index=label)\n", | |
| "sns.heatmap(cm, annot=True, fmt='.2f')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.metrics import roc_auc_score, roc_curve\n", | |
| "auc = roc_auc_score(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n", | |
| "fpr, tpr, thresholds = roc_curve(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n", | |
| "plt.plot([0, 1], [0, 1], linestyle='--')\n", | |
| "plt.plot(fpr, tpr, marker='.')\n", | |
| "plt.title(auc)\n", | |
| "plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Convert" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "converter = tf.contrib.lite.TocoConverter.from_keras_model_file('earl_best.h5')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tflite_model = converter.convert()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "open('earl_model.tflite', \"wb\").write(tflite_model)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Test" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "model = tf.contrib.lite.Interpreter(f'earl_model.tflite')\n", | |
| "model.allocate_tensors()\n", | |
| "model_in = model.get_input_details()[0]\n", | |
| "model_out = model.get_output_details()[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def predict(model, i, o, x):\n", | |
| " model.set_tensor(i['index'], np.float32(x.reshape((1, len(features)))))\n", | |
| " model.invoke()\n", | |
| " y = model.get_tensor(o['index'])\n", | |
| " return y" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "model.get_input_details()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "scrolled": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "x = open('dataset281118.dat')\n", | |
| "q = []\n", | |
| "for i in range(4):\n", | |
| " s = json.loads(x.readline().strip())\n", | |
| " l = []\n", | |
| " for j in X_train_scaled[i]:\n", | |
| " l.append(float(j))\n", | |
| " d = dict(expected=float(predict(model, model_in, model_out, X_train_scaled[i])[0][0]), md5=s['md5'], verdict=float(Y_train[i]), features=l)\n", | |
| " q.append(d)\n", | |
| "with open('test.json', 'w') as f:\n", | |
| " json.dump(q, f)\n", | |
| "x.close()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "idx = 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "X_train[idx].astype(int)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "X_train_scaled[idx]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "predict(model, model_in, model_out, X_train_scaled[idx])" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment