Created
March 3, 2018 14:41
-
-
Save mixuala/c94c5da6e056010e552df3b052f701fa to your computer and use it in GitHub Desktop.
HOWTO train and validate sequentially with `tf.slim`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "HOWTO: tf.slim train+validate sequential.ipynb", | |
| "version": "0.3.2", | |
| "views": {}, | |
| "default_view": {}, | |
| "provenance": [ | |
| { | |
| "file_id": "11PWvXR85NAIe6LAV1kocheXGDJa1VOa1", | |
| "timestamp": 1519981481371 | |
| } | |
| ], | |
| "collapsed_sections": [], | |
| "toc_visible": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "accelerator": "GPU" | |
| }, | |
| "cells": [ | |
| { | |
| "metadata": { | |
| "id": "p2_4yjzakP8s", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# HowTo: running `train` & `validation` loops sequentially in the same session with `tf-slim` \n", | |
| "\n", | |
| "`Validation` loops are used to monitor `training` to identify when models reach a high variance state, e.g. `overfitting`. \n", | |
| "\n", | |
| "The purpose of this notebook is to provide an example of how to run a `validation` loop **for an entire epoch** in the same session as a `train` loop using `tf.slim`. We'll also provide an example of how to show `validation` summaries on `tensorboard`.\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "## references:\n", | |
| "\n", | |
| "* This example borrows from the [slim walkthrough](https://github.com/tensorflow/models/blob/master/research/slim/slim_walkthrough.ipynb) notebook \n", | |
| "* recipe for [running train/validation/test loops](https://github.com/tensorflow/tensorflow/issues/5987) in the same `tf.slim` session\n", | |
| "* plot [training and validation losses](https://stackoverflow.com/questions/37146614/tensorboard-plot-training-and-validation-losses-on-the-same-graph) on the same tensorboard graph\n" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "qZ6P8ndskPJq", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "\n", | |
| "\n", | |
| "# Simple Example\n", | |
| "\n", | |
| "## Setup" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "gQMK5A4zpVm7", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| }, | |
| "output_extras": [ | |
| { | |
| "item_id": 28 | |
| } | |
| ], | |
| "base_uri": "https://localhost:8080/", | |
| "height": 221 | |
| }, | |
| "outputId": "0600cd83-4d3c-4f9d-8a51-073c1fecc588", | |
| "executionInfo": { | |
| "status": "ok", | |
| "timestamp": 1520087137927, | |
| "user_tz": -480, | |
| "elapsed": 14526, | |
| "user": { | |
| "displayName": "michael lin", | |
| "photoUrl": "//lh3.googleusercontent.com/-etfWG7MvQwk/AAAAAAAAAAI/AAAAAAAAADM/BxW0OLTdkjI/s50-c-k-no/photo.jpg", | |
| "userId": "111539764795298113840" | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "import os\n", | |
| "# load repo for TF-Slim image models\n", | |
| "found = os.path.isdir('/content/models')\n", | |
| "if not found:\n", | |
| " !git clone https://github.com/tensorflow/models.git\n", | |
| " !git clone https://github.com/mixuala/colab_utils.git" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Cloning into 'models'...\n", | |
| "remote: Counting objects: 12755, done.\u001b[K\n", | |
| "remote: Total 12755 (delta 2), reused 2 (delta 2), pack-reused 12752\u001b[K\n", | |
| "Receiving objects: 100% (12755/12755), 412.30 MiB | 44.06 MiB/s, done.\n", | |
| "Resolving deltas: 100% (7188/7188), done.\n", | |
| "Checking out files: 100% (1794/1794), done.\n", | |
| "Cloning into 'colab_utils'...\n", | |
| "remote: Counting objects: 198, done.\u001b[K\n", | |
| "remote: Compressing objects: 100% (21/21), done.\u001b[K\n", | |
| "remote: Total 198 (delta 9), reused 22 (delta 6), pack-reused 171\u001b[K\n", | |
| "Receiving objects: 100% (198/198), 56.79 KiB | 7.10 MiB/s, done.\n", | |
| "Resolving deltas: 100% (77/77), done.\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "Kvag4TvZoy2S", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "from __future__ import absolute_import\n", | |
| "from __future__ import division\n", | |
| "from __future__ import print_function\n", | |
| "\n", | |
| "import os, sys, shutil\n", | |
| "\n", | |
| "import matplotlib\n", | |
| "%matplotlib inline\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import math\n", | |
| "import numpy as np\n", | |
| "import tensorflow as tf\n", | |
| "import time\n", | |
| "\n", | |
| "from models.research.slim.datasets import dataset_utils\n", | |
| "from colab_utils import tboard\n", | |
| "\n", | |
| "# Main slim library\n", | |
| "from tensorflow.contrib import slim" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "Wy-Nm6-sruSE", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Build `tf.slim` training loop" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "aQGSBeHUsQoU", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "### \n", | |
| "### helpers\n", | |
| "###\n", | |
| "\n", | |
| "def reset_tensorboard(log_dir):\n", | |
| " try:\n", | |
| " shutil.rmtree(log_dir)\n", | |
| " except:\n", | |
| " pass\n", | |
| " if not tf.gfile.Exists(log_dir): tf.gfile.MakeDirs(log_dir) \n", | |
| " not_empty = %ls $log_dir\n", | |
| " if not not_empty: print(\"TRAIN_DIR={}, ls:{}\".format(log_dir, not_empty))\n", | |
| " \n", | |
| "def get_checkpoint_step(dir):\n", | |
| " \"\"\"get global_step from checkpoint_path outside of graph\"\"\"\n", | |
| " import re\n", | |
| " path = tf.train.latest_checkpoint(dir)\n", | |
| " if not path:\n", | |
| " return 0\n", | |
| " found = re.search(\"(\\d+)$\", path)\n", | |
| " return int(found[0]) if found else 0 " | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "z0iO_a9no91B", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "### \n", | |
| "### net\n", | |
| "###\n", | |
| "def regression_model(inputs, is_training=True, scope=\"regression_model\"):\n", | |
| " \"\"\"Creates the regression model.\n", | |
| "\n", | |
| " Args:\n", | |
| " inputs: A node that yields a `Tensor` of size [batch_size, dimensions].\n", | |
| " is_training: Whether or not we're currently training the model.\n", | |
| " scope: An optional variable_op scope for the model.\n", | |
| "\n", | |
| " Returns:\n", | |
| " predictions: 1-D `Tensor` of shape [batch_size] of responses.\n", | |
| " end_points: A dict of end points representing the hidden layers.\n", | |
| " \"\"\"\n", | |
| " # Make the model, reuse weights for validation batches using reuse=tf.AUTO_REUSE\n", | |
| " with slim.arg_scope([slim.fully_connected], reuse=tf.AUTO_REUSE):\n", | |
| " end_points = {}\n", | |
| " # Set the default weights_regularizer and activation for each fully_connected layer.\n", | |
| " with slim.arg_scope([slim.fully_connected],\n", | |
| " activation_fn=tf.nn.relu,\n", | |
| " weights_regularizer=slim.l2_regularizer(0.01)):\n", | |
| "\n", | |
| " # Creates a fully connected layer from the inputs with 32 hidden units.\n", | |
| " net = slim.fully_connected(inputs, 32, scope='fc1')\n", | |
| " end_points['fc1'] = net\n", | |
| "\n", | |
| " # Adds a dropout layer to prevent over-fitting.\n", | |
| " net = slim.dropout(net, 0.8, is_training=is_training)\n", | |
| "\n", | |
| " # Adds another fully connected layer with 16 hidden units.\n", | |
| " net = slim.fully_connected(net, 16, scope='fc2')\n", | |
| " end_points['fc2'] = net\n", | |
| "\n", | |
| " # Creates a fully-connected layer with a single hidden unit. Note that the\n", | |
| " # layer is made linear by setting activation_fn=None.\n", | |
| " predictions = slim.fully_connected(net, 1, activation_fn=None, scope='prediction')\n", | |
| " end_points['out'] = predictions\n", | |
| "\n", | |
| " return predictions, end_points\n" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "CmQmGn2qMpbk", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Using `TFRecords` with `slim.dataset.Dataset`\n", | |
| "\n", | |
| "see: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim/python/slim/data/\n", | |
| "\n", | |
| "reading data: \n", | |
| "- https://www.tensorflow.org/api_guides/python/reading_data\n", | |
| "- https://github.com/kwotsin/transfer_learning_tutorial/blob/master/train_flowers.py" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "B_1zitZeHiVZ", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### create `TFRecord` files" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "Qwp7-exyGn-X", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "### TFRecord helpers\n", | |
| "DATA_DIR = \"/tmp/tfrecords\"\n", | |
| "TRAIN_SAMPLES = 320\n", | |
| "VALIDATE_SAMPLES = 64\n", | |
| "\n", | |
| "\n", | |
| "def _count_tfrecord_samples(tfrecord_list):\n", | |
| " \"\"\"Count the total number of examples in list() of tfrecord file paths\"\"\"\n", | |
| " num_samples = 0\n", | |
| " for tfrecord_file in tfrecord_list:\n", | |
| " # print(tfrecord_file)\n", | |
| " for record in tf.python_io.tf_record_iterator(tfrecord_file):\n", | |
| " num_samples += 1\n", | |
| " return num_samples\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "### create TFRecord files\n", | |
| "def produce_batch(batch_size, noise=0.3):\n", | |
| " \"\"\"### create raw data as numpy arrays\"\"\"\n", | |
| " xs = np.random.random(size=[batch_size, 1]) * 10\n", | |
| " ys = np.sin(xs) + 5 + np.random.normal(size=[batch_size, 1], scale=noise)\n", | |
| " return [xs.astype(np.float32), ys.astype(np.float32)]\n", | |
| "\n", | |
| "def convert_data_to_tensors(x, y):\n", | |
| " inputs = tf.constant(x)\n", | |
| " inputs.set_shape([None, 1])\n", | |
| " outputs = tf.constant(y)\n", | |
| " outputs.set_shape([None, 1])\n", | |
| " return inputs, outputs\n", | |
| " \n", | |
| "\n", | |
| "if not tf.gfile.Exists(DATA_DIR): tf.gfile.MakeDirs(DATA_DIR) \n", | |
| "def np_to_tfrecord(inputs, labels, filename, data_dir): \n", | |
| " \"\"\" create `TFRecord` files from numpy raw data\"\"\"\n", | |
| " writer = tf.python_io.TFRecordWriter( \"{}/{}.tfrecord\".format(data_dir, filename))\n", | |
| " for m in range(len(inputs)):\n", | |
| " # Feature contains a map of string for each feature proto objects\n", | |
| " # NOTE: for tf.train.FloatList|Int64List value must be a tuple or list!!\n", | |
| " feature = {\n", | |
| " \"input\": tf.train.Feature(float_list=tf.train.FloatList(value= inputs[m].tolist() )),\n", | |
| " \"label\": tf.train.Feature(float_list=tf.train.FloatList(value= labels[m].tolist() )),\n", | |
| " }\n", | |
| "\n", | |
| " # Construct the Example proto object\n", | |
| " example = tf.train.Example(features=tf.train.Features(feature=feature))\n", | |
| " # Serialize the example to a string, write records to a tfrecords file\n", | |
| "\n", | |
| " writer.write(example.SerializeToString())\n", | |
| " writer.close()\n", | |
| "\n", | |
| "\n", | |
| "def create_raw_data(shards=4):\n", | |
| " \"\"\"create sharded \".tfrecord\" files\"\"\" \n", | |
| " for i in range(shards):\n", | |
| " x, y = produce_batch(TRAIN_SAMPLES) # x.shape=[320,1], y.shape=[320,1]\n", | |
| " np_to_tfrecord(x, y, \"train_{}\".format(i), DATA_DIR)\n", | |
| " x, y = produce_batch(VALIDATE_SAMPLES) # x.shape=[64,1], y.shape=[64,1]\n", | |
| " np_to_tfrecord(x, y, \"validation_{}\".format(i), DATA_DIR)\n", | |
| "\n", | |
| "\n", | |
| " \n", | |
| "found = [f for f in os.listdir(DATA_DIR) if f.endswith(\".tfrecord\")]\n", | |
| "if found:\n", | |
| " print(\"using TFRecords found in path=\",DATA_DIR)\n", | |
| "else:\n", | |
| " create_raw_data() " | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "BrltnNMwr_TE", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### get `slim.dataset.Dataset` from `TFRecords`" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "jVtdowyITXYf", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "### get slim.dataset.Dataset from `TFRecords` for graph\n", | |
| "def get_slim_Dataset(filter=\"none\"):\n", | |
| " \"\"\"usage:\n", | |
| " train_dataset = get_slim_Dataset(\"train\")\n", | |
| " validation_dataset = get_slim_Dataset(\"validation\")\n", | |
| "\n", | |
| " Return:\n", | |
| " slim.dataset.Dataset\n", | |
| " \"\"\"\n", | |
| "\n", | |
| " \n", | |
| " keys_to_features = {\n", | |
| " 'input': tf.FixedLenFeature( [], tf.float32),\n", | |
| " 'label': tf.FixedLenFeature( [], tf.float32),\n", | |
| " }\n", | |
| "\n", | |
| " items_to_handlers = {\n", | |
| " 'input': slim.tfexample_decoder.Tensor('input'),\n", | |
| " 'label': slim.tfexample_decoder.Tensor('label'),\n", | |
| " }\n", | |
| "\n", | |
| " decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)\n", | |
| "\n", | |
| " sources = [\"{}/{}\".format(DATA_DIR, f) for f in os.listdir(DATA_DIR) if filter in f]\n", | |
| " # sources = \"{}_*.tfrecord\".format(\"train\")\n", | |
| " dataset = slim.dataset.Dataset(\n", | |
| " data_sources=sources, \n", | |
| " reader=tf.TFRecordReader,\n", | |
| " decoder=decoder,\n", | |
| " num_samples=_count_tfrecord_samples(sources),\n", | |
| " items_to_descriptions={})\n", | |
| " return dataset\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "### get a `tf.train.batch()` from `TFRecords` using the Dataset API\n", | |
| "def load_batch_from_slim_dataset(dataset, batch_size=32):\n", | |
| " \"\"\"usage:\n", | |
| " with tf.Graph().as_default():\n", | |
| " x_train, y_train = load_batch_from_slim_dataset(train_dataset)\n", | |
| " x_validation, y_validation = load_batch_from_slim_dataset(validation_dataset)\n", | |
| " Return:\n", | |
| " [inputs, labels] as tensors\n", | |
| " \"\"\"\n", | |
| " data_provider = slim.dataset_data_provider.DatasetDataProvider(\n", | |
| " dataset, common_queue_capacity=32,\n", | |
| " common_queue_min=8)\n", | |
| " input, label = data_provider.get(['input', 'label'])\n", | |
| " \n", | |
| " # Batch it up.\n", | |
| " inputs, labels = tf.train.batch(\n", | |
| " [input, label],\n", | |
| " batch_size=batch_size,\n", | |
| " num_threads=1,\n", | |
| " capacity=2 * batch_size)\n", | |
| " inputs = tf.reshape(inputs, [batch_size,1])\n", | |
| " labels = tf.reshape(labels, [batch_size,1])\n", | |
| " return [inputs, labels]\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "## load the datasets for \"train\" and \"validation\"\n", | |
| "train_dataset = get_slim_Dataset(\"train\")\n", | |
| "validation_dataset = get_slim_Dataset(\"validation\")" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "HYvY5RYNtGRL", | |
| "colab_type": "text" | |
| }, | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Build graph for training loop" | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "_fDTDgqEswde", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| }, | |
| "output_extras": [ | |
| { | |
| "item_id": 3 | |
| }, | |
| { | |
| "item_id": 4 | |
| } | |
| ], | |
| "base_uri": "https://localhost:8080/", | |
| "height": 119 | |
| }, | |
| "outputId": "99d41de6-c8ec-48e6-d2ab-bd3d3a00db83", | |
| "executionInfo": { | |
| "status": "ok", | |
| "timestamp": 1520087187785, | |
| "user_tz": -480, | |
| "elapsed": 4289, | |
| "user": { | |
| "displayName": "michael lin", | |
| "photoUrl": "//lh3.googleusercontent.com/-etfWG7MvQwk/AAAAAAAAAAI/AAAAAAAAADM/BxW0OLTdkjI/s50-c-k-no/photo.jpg", | |
| "userId": "111539764795298113840" | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "### \n", | |
| "### runtime params\n", | |
| "###\n", | |
| "LOG_DIR = '/tmp/tensorboard'\n", | |
| "TRAIN_DIR = LOG_DIR + \"/train\"\n", | |
| "\n", | |
| "BATCH_SIZE = 32\n", | |
| "LOG_INTERVAL = 100\n", | |
| "\n", | |
| "tboard.launch_tensorboard(bin_dir=\"/tmp\", log_dir=LOG_DIR)\n" | |
| ], | |
| "execution_count": 8, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "calling wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip ...\n", | |
| "calling unzip ngrok-stable-linux-amd64.zip ...\n", | |
| "ngrok installed. path=/tmp/ngrok\n", | |
| "status: tensorboard=False, ngrok=False\n", | |
| "tensorboard url= http://ced5fc3a.ngrok.io\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "'http://ced5fc3a.ngrok.io'" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 8 | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "noNMvWRAJvHH", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "\n", | |
| "def train_one_loop(label=\"simple\", epochs=1):\n", | |
| "\n", | |
| " ## derived values\n", | |
| " dataset = train_dataset\n", | |
| " is_training = True\n", | |
| " log_dir = os.path.join(LOG_DIR, label)\n", | |
| "\n", | |
| " steps = get_checkpoint_step(log_dir)\n", | |
| " steps += epochs * int(dataset.num_samples / BATCH_SIZE)\n", | |
| " \n", | |
| " tf.logging.set_verbosity(tf.logging.INFO)\n", | |
| "\n", | |
| " with tf.Graph().as_default():\n", | |
| " global_step = tf.train.get_or_create_global_step()\n", | |
| "\n", | |
| " # adjust steps for restored global_step value\n", | |
| "\n", | |
| "\n", | |
| " ### get inputs, labels from slim.data.dataset.Dataset()\n", | |
| " inputs, labels = load_batch_from_slim_dataset(dataset, batch_size=BATCH_SIZE)\n", | |
| " predictions, _ = regression_model(inputs, is_training=is_training)\n", | |
| " \n", | |
| " ###\n", | |
| " ### train graph\n", | |
| " ###\n", | |
| " # with tf.variable_scope(\"train\"):\n", | |
| " loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)\n", | |
| " \n", | |
| " ### train_op\n", | |
| " total_loss = tf.losses.get_total_loss() # excludes loss_collection=\"validation\"\n", | |
| " optimizer = tf.train.AdamOptimizer(learning_rate=0.005)\n", | |
| " train_op = slim.learning.create_train_op(total_loss, optimizer)\n", | |
| "\n", | |
| " # tboard.launch_tensorboard(log_dir=LOG_DIR)\n", | |
| "\n", | |
| " # train summaries\n", | |
| " tf.summary.scalar(\"train/total_loss\", total_loss)\n", | |
| " tf.summary.scalar(\"train/loss\", loss)\n", | |
| " tf.summary.scalar(\"mse_loss\", loss)\n", | |
| " \n", | |
| " train_writer = tf.summary.FileWriter(os.path.join(log_dir, \"train\"))\n", | |
| "\n", | |
| " final_loss = slim.learning.train(\n", | |
| " train_op,\n", | |
| " global_step=global_step,\n", | |
| " # train_step_fn=train_step_fn,\n", | |
| " logdir=log_dir,\n", | |
| " number_of_steps=steps,\n", | |
| " # summary_op=train_summary_op,\n", | |
| " summary_writer=train_writer,\n", | |
| " save_summaries_secs=10,\n", | |
| " save_interval_secs=10,\n", | |
| " log_every_n_steps=LOG_INTERVAL,\n", | |
| " )\n", | |
| " \n", | |
| " print(\"Finished {}. Last batch loss:{}\".format(\"train\", final_loss))\n", | |
| " print(\"Checkpoint saved in {}\\n\".format(log_dir))" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "g1UxtLuFJz26", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "\n", | |
| "\n", | |
| "def validate_one_loop(label=\"simple\", epochs=1):\n", | |
| "\n", | |
| " ## derived values\n", | |
| " dataset = validation_dataset\n", | |
| " is_training = False\n", | |
| " steps = epochs * int(dataset.num_samples / BATCH_SIZE)\n", | |
| " log_dir = os.path.join(LOG_DIR, label)\n", | |
| " \n", | |
| " tf.logging.set_verbosity(tf.logging.WARN)\n", | |
| "\n", | |
| " with tf.Graph().as_default():\n", | |
| " ### get inputs, labels from slim.data.dataset.Dataset()\n", | |
| " inputs, labels = load_batch_from_slim_dataset(dataset, batch_size=BATCH_SIZE)\n", | |
| " predictions, _ = regression_model(inputs, is_training=is_training)\n", | |
| "\n", | |
| " # tboard.launch_tensorboard(log_dir=LOG_DIR)\n", | |
| "\n", | |
| " # Define the streaming metrics to evaluate over the validation batches.\n", | |
| " names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({\n", | |
| " # \"accuracy\": slim.metrics.accuracy(predictions, labels),\n", | |
| " \"validation/loss\": tf.metrics.mean_squared_error(predictions, labels),\n", | |
| " \"mse_loss\": tf.metrics.mean_squared_error(predictions, labels),\n", | |
| " })\n", | |
| "\n", | |
| " eval_op = list(names_to_updates.values())\n", | |
| "\n", | |
| " for metric_name, metric_value in names_to_values.items():\n", | |
| " # print(\">> summary name={}, value={}\".format(metric_name, metric_value))\n", | |
| " tf.summary.scalar(metric_name, metric_value)\n", | |
| "\n", | |
| " checkpoint_path = tf.train.latest_checkpoint(log_dir)\n", | |
| " summary_dir = os.path.join(log_dir, \"validate\")\n", | |
| " slim.evaluation.evaluate_once( '', checkpoint_path, summary_dir,\n", | |
| " num_evals=steps,\n", | |
| " eval_op=eval_op,\n", | |
| " )\n", | |
| " print(\"Finished {}\".format(\"validate\"))\n", | |
| "\n", | |
| " print(\"finished with evaluation\\n\\n\") " | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "Td8KlBaYJ9LV", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| }, | |
| "output_extras": [ | |
| { | |
| "item_id": 27 | |
| }, | |
| { | |
| "item_id": 31 | |
| } | |
| ], | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1632 | |
| }, | |
| "outputId": "17819e98-adb0-4153-bf48-955b4fcea851", | |
| "executionInfo": { | |
| "status": "ok", | |
| "timestamp": 1520087746230, | |
| "user_tz": -480, | |
| "elapsed": 28500, | |
| "user": { | |
| "displayName": "michael lin", | |
| "photoUrl": "//lh3.googleusercontent.com/-etfWG7MvQwk/AAAAAAAAAAI/AAAAAAAAADM/BxW0OLTdkjI/s50-c-k-no/photo.jpg", | |
| "userId": "111539764795298113840" | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "def train(loops=1):\n", | |
| " for _ in range(loops):\n", | |
| " train_one_loop(epochs=5)\n", | |
| " validate_one_loop(epochs=1)\n", | |
| " \n", | |
| " \n", | |
| "tboard.launch_tensorboard(bin_dir=\"/tmp\", log_dir=LOG_DIR) \n", | |
| "train(loops=5)" | |
| ], | |
| "execution_count": 29, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "ngrok installed\n", | |
| "status: tensorboard=True, ngrok=True\n", | |
| "tensorboard url= http://ced5fc3a.ngrok.io\n", | |
| "INFO:tensorflow:Restoring parameters from /tmp/tensorboard/simple/model.ckpt-1600\n", | |
| "INFO:tensorflow:Running local_init_op.\n", | |
| "INFO:tensorflow:Done running local_init_op.\n", | |
| "INFO:tensorflow:Starting Session.\n", | |
| "INFO:tensorflow:Saving checkpoint to path /tmp/tensorboard/simple/model.ckpt\n", | |
| "INFO:tensorflow:Starting Queues.\n", | |
| "INFO:tensorflow:global_step/sec: 0\n", | |
| "INFO:tensorflow:Recording summary at step 1600.\n", | |
| "INFO:tensorflow:global step 1700: loss = 0.3564 (0.017 sec/step)\n", | |
| "INFO:tensorflow:global step 1800: loss = 0.4546 (0.017 sec/step)\n", | |
| "INFO:tensorflow:Stopping Training.\n", | |
| "INFO:tensorflow:Finished training! Saving model to disk.\n", | |
| "Finished train. Last batch loss:0.4546107053756714\n", | |
| "Checkpoint saved in /tmp/tensorboard/simple\n", | |
| "\n", | |
| "Finished validate\n", | |
| "finished with evaluation\n", | |
| "\n", | |
| "\n", | |
| "INFO:tensorflow:Restoring parameters from /tmp/tensorboard/simple/model.ckpt-1800\n", | |
| "INFO:tensorflow:Running local_init_op.\n", | |
| "INFO:tensorflow:Done running local_init_op.\n", | |
| "INFO:tensorflow:Starting Session.\n", | |
| "INFO:tensorflow:Saving checkpoint to path /tmp/tensorboard/simple/model.ckpt\n", | |
| "INFO:tensorflow:global_step/sec: 0\n", | |
| "INFO:tensorflow:Starting Queues.\n", | |
| "INFO:tensorflow:Recording summary at step 1800.\n", | |
| "INFO:tensorflow:global step 1900: loss = 0.3071 (0.014 sec/step)\n", | |
| "INFO:tensorflow:global step 2000: loss = 0.4565 (0.018 sec/step)\n", | |
| "INFO:tensorflow:Stopping Training.\n", | |
| "INFO:tensorflow:Finished training! Saving model to disk.\n", | |
| "Finished train. Last batch loss:0.4565228521823883\n", | |
| "Checkpoint saved in /tmp/tensorboard/simple\n", | |
| "\n", | |
| "Finished validate\n", | |
| "finished with evaluation\n", | |
| "\n", | |
| "\n", | |
| "INFO:tensorflow:Restoring parameters from /tmp/tensorboard/simple/model.ckpt-2000\n", | |
| "INFO:tensorflow:Running local_init_op.\n", | |
| "INFO:tensorflow:Done running local_init_op.\n", | |
| "INFO:tensorflow:Starting Session.\n", | |
| "INFO:tensorflow:Saving checkpoint to path /tmp/tensorboard/simple/model.ckpt\n", | |
| "INFO:tensorflow:Starting Queues.\n", | |
| "INFO:tensorflow:Recording summary at step 2000.\n", | |
| "INFO:tensorflow:global step 2100: loss = 0.2760 (0.016 sec/step)\n", | |
| "INFO:tensorflow:global step 2200: loss = 0.4507 (0.014 sec/step)\n", | |
| "INFO:tensorflow:Stopping Training.\n", | |
| "INFO:tensorflow:Finished training! Saving model to disk.\n", | |
| "Finished train. Last batch loss:0.4506738483905792\n", | |
| "Checkpoint saved in /tmp/tensorboard/simple\n", | |
| "\n", | |
| "Finished validate\n", | |
| "finished with evaluation\n", | |
| "\n", | |
| "\n", | |
| "INFO:tensorflow:Restoring parameters from /tmp/tensorboard/simple/model.ckpt-2200\n", | |
| "INFO:tensorflow:Running local_init_op.\n", | |
| "INFO:tensorflow:Done running local_init_op.\n", | |
| "INFO:tensorflow:Starting Session.\n", | |
| "INFO:tensorflow:Saving checkpoint to path /tmp/tensorboard/simple/model.ckpt\n", | |
| "INFO:tensorflow:Starting Queues.\n", | |
| "INFO:tensorflow:Recording summary at step 2200.\n", | |
| "INFO:tensorflow:global step 2300: loss = 0.2935 (0.016 sec/step)\n", | |
| "INFO:tensorflow:global step 2400: loss = 0.3863 (0.017 sec/step)\n", | |
| "INFO:tensorflow:Stopping Training.\n", | |
| "INFO:tensorflow:Finished training! Saving model to disk.\n", | |
| "Finished train. Last batch loss:0.386338472366333\n", | |
| "Checkpoint saved in /tmp/tensorboard/simple\n", | |
| "\n", | |
| "Finished validate\n", | |
| "finished with evaluation\n", | |
| "\n", | |
| "\n", | |
| "INFO:tensorflow:Restoring parameters from /tmp/tensorboard/simple/model.ckpt-2400\n", | |
| "INFO:tensorflow:Running local_init_op.\n", | |
| "INFO:tensorflow:Done running local_init_op.\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "INFO:tensorflow:Starting Session.\n", | |
| "INFO:tensorflow:Saving checkpoint to path /tmp/tensorboard/simple/model.ckpt\n", | |
| "INFO:tensorflow:Starting Queues.\n", | |
| "INFO:tensorflow:Recording summary at step 2400.\n", | |
| "INFO:tensorflow:global step 2500: loss = 0.3667 (0.017 sec/step)\n", | |
| "INFO:tensorflow:global step 2600: loss = 0.3626 (0.016 sec/step)\n", | |
| "INFO:tensorflow:Stopping Training.\n", | |
| "INFO:tensorflow:Finished training! Saving model to disk.\n", | |
| "Finished train. Last batch loss:0.3626402020454407\n", | |
| "Checkpoint saved in /tmp/tensorboard/simple\n", | |
| "\n", | |
| "Finished validate\n", | |
| "finished with evaluation\n", | |
| "\n", | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "id": "Fa6Fv5oa4F_K", | |
| "colab_type": "code", | |
| "colab": { | |
| "autoexec": { | |
| "startup": false, | |
| "wait_interval": 0 | |
| } | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment