{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {},
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/NeuromatchAcademy/course-content-dl/blob/main/tutorials/W1D5_Optimization/student/W1D5_Tutorial1.ipynb\" target=\"_blank\"><img alt=\"Open In Colab\" src=\"https://colab.research.google.com/assets/colab-badge.svg\"/></a>   <a href=\"https://kaggle.com/kernels/welcome?src=https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W1D5_Optimization/student/W1D5_Tutorial1.ipynb\" target=\"_blank\"><img alt=\"Open in Kaggle\" src=\"https://kaggle.com/static/images/open-in-kaggle.svg\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "# Tutorial 1: Optimization techniques\n",
    "\n",
    "**Week 1, Day 5: Optimization**\n",
    "\n",
    "**By Neuromatch Academy**\n",
    "\n",
    "__Content creators:__ Jose Gallego-Posada, Ioannis Mitliagkas\n",
    "\n",
    "__Content reviewers:__ Piyush Chauhan, Vladimir Haltakov, Siwei Bai, Kelson Shilling-Scrivo\n",
    "\n",
    "__Content editors:__ Charles J Edelson, Gagana B, Spiros Chavlis\n",
    "\n",
    "__Production editors:__ Arush Tagade, R. Krishnakumaran, Gagana B, Spiros Chavlis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Tutorial Objectives\n",
    "\n",
    "Objectives:\n",
    "*   Necessity and importance of optimization\n",
    "*   Introduction to commonly used optimization techniques\n",
    "*   Optimization in non-convex loss landscapes\n",
    "*   'Adaptive' hyperparameter tuning\n",
    "*   Ethical concerns\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown\n",
    "from IPython.display import IFrame\n",
    "from ipywidgets import widgets\n",
    "out = widgets.Output()\n",
    "with out:\n",
    "    print(f\"If you want to download the slides: https://osf.io/download/ft2sz/\")\n",
    "    display(IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/ft2sz/?direct%26mode=render%26action=download%26mode=render\", width=730, height=410))\n",
    "display(out)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Install and import feedback gadget\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Install and import feedback gadget\n",
    "\n",
    "!pip3 install vibecheck datatops --quiet\n",
    "\n",
    "from vibecheck import DatatopsContentReviewContainer\n",
    "def content_review(notebook_section: str):\n",
    "    \"\"\"\n",
    "    Render the feedback widget for one section of this notebook.\n",
    "\n",
    "    Args:\n",
    "      notebook_section: str\n",
    "        Identifier of the section, passed on to the review container\n",
    "\n",
    "    Returns:\n",
    "      Whatever `DatatopsContentReviewContainer(...).render()` returns\n",
    "    \"\"\"\n",
    "    datatops_config = {\n",
    "        \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n",
    "        \"name\": \"neuromatch_dl\",\n",
    "        \"user_key\": \"f379rz8y\",\n",
    "    }\n",
    "    # The first positional argument is the text prompt, left empty on purpose\n",
    "    review_container = DatatopsContentReviewContainer(\n",
    "        \"\", notebook_section, datatops_config\n",
    "    )\n",
    "    return review_container.render()\n",
    "\n",
    "\n",
    "feedback_prefix = \"W1D5_T1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Imports\n",
    "import copy\n",
    "\n",
    "import ipywidgets as widgets\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "import time\n",
    "import torch\n",
    "import torchvision\n",
    "import torchvision.datasets as datasets\n",
    "import torch.nn.functional as F\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Figure settings\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Figure settings\n",
    "import logging\n",
    "logging.getLogger('matplotlib.font_manager').disabled = True\n",
    "\n",
    "import ipywidgets as widgets  # interactive display\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n",
    "plt.rc('axes', unicode_minus=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Helper functions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Helper functions\n",
    "def print_params(model):\n",
    "  \"\"\"\n",
    "  Print every trainable parameter of a model together with\n",
    "  its current value\n",
    "\n",
    "  Args:\n",
    "    model: an nn.Module inherited model\n",
    "      Model whose named parameters are listed\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Parameters that do not require gradients are skipped\n",
    "  trainable = ((n, p) for n, p in model.named_parameters() if p.requires_grad)\n",
    "  for param_name, tensor in trainable:\n",
    "    print(param_name, tensor.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Set random seed\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Executing `set_seed(seed=seed)` you are setting the seed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Set random seed\n",
    "\n",
    "# @markdown Executing `set_seed(seed=seed)` you are setting the seed\n",
    "\n",
    "# for DL its critical to set the random seed so that students can have a\n",
    "# baseline to compare their results to expected results.\n",
    "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n",
    "\n",
    "# Call the `set_seed` function in the exercises to ensure reproducibility.\n",
    "import random\n",
    "import torch\n",
    "\n",
    "def set_seed(seed=None, seed_torch=True):\n",
    "  \"\"\"\n",
    "  Handles variability by controlling sources of randomness\n",
    "  through set seed values\n",
    "\n",
    "  Args:\n",
    "    seed: Integer\n",
    "      Set the seed value to given integer.\n",
    "      If no seed, a random integer in the range [0, 2^32) is drawn\n",
    "    seed_torch: Bool\n",
    "      If True, also seeds PyTorch (CPU and CUDA devices) and sets\n",
    "      cuDNN to deterministic, non-benchmarking mode to offer some\n",
    "      guarantees on reproducibility\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  if seed is None:\n",
    "    # Cast to a built-in int: `np.random.choice` returns a NumPy integer\n",
    "    # scalar, which `random.seed` rejects with a TypeError on Python >= 3.11\n",
    "    seed = int(np.random.choice(2 ** 32))\n",
    "  random.seed(seed)\n",
    "  np.random.seed(seed)\n",
    "  if seed_torch:\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed_all(seed)\n",
    "    torch.cuda.manual_seed(seed)\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "  print(f'Random seed {seed} has been set.')\n",
    "\n",
    "\n",
    "# In case that `DataLoader` is used\n",
    "def seed_worker(worker_id):\n",
    "  \"\"\"\n",
    "  Re-seed NumPy and Python's `random` from the value of\n",
    "  `torch.initial_seed()`, so DataLoader workers follow torch's seeding.\n",
    "\n",
    "  Args:\n",
    "    worker_id: integer\n",
    "      ID of the worker subprocess (not used inside the function)\n",
    "      Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Fold torch's seed into the [0, 2**32) range accepted by `np.random.seed`\n",
    "  derived_seed = torch.initial_seed() % (2 ** 32)\n",
    "  random.seed(derived_seed)\n",
    "  np.random.seed(derived_seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Set device (GPU or CPU). Execute `set_device()`\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Set device (GPU or CPU). Execute `set_device()`\n",
    "# especially if torch modules are used.\n",
    "\n",
    "# inform the user if the notebook uses GPU or CPU.\n",
    "\n",
    "def set_device():\n",
    "  \"\"\"\n",
    "  Select the device: CUDA if available, CPU otherwise.\n",
    "  Prints a message telling the user which one was selected.\n",
    "\n",
    "  Args:\n",
    "    None\n",
    "\n",
    "  Returns:\n",
    "    device: String\n",
    "      \"cuda\" if a GPU is available, \"cpu\" otherwise\n",
    "  \"\"\"\n",
    "  device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "  if device != \"cuda\":\n",
    "    print(\"WARNING: For this notebook to perform best, \"\n",
    "        \"if possible, in the menu under `Runtime` -> \"\n",
    "        \"`Change runtime type.`  select `GPU` \")\n",
    "  else:\n",
    "    print(\"GPU is enabled in this notebook.\")\n",
    "\n",
    "  return device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "SEED = 2021\n",
    "set_seed(seed=SEED)\n",
    "DEVICE = set_device()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 1. Introduction\n",
    "\n",
    "*Time estimate: ~15 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 1: Introduction\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 1: Introduction\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  \"\"\"\n",
    "  IFrame that embeds a video hosted on Bilibili or OSF.\n",
    "  \"\"\"\n",
    "\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "      id: String\n",
    "        Video identifier on the hosting platform\n",
    "      source: String\n",
    "        Hosting platform, either 'Bilibili' or 'Osf'\n",
    "      page: Integer\n",
    "        Page index inserted in the Bilibili player URL\n",
    "      width, height: Integer\n",
    "        Size of the embedded frame\n",
    "      **kwargs:\n",
    "        Forwarded to `IFrame`\n",
    "\n",
    "    Raises:\n",
    "      ValueError: if `source` is not one of the supported platforms\n",
    "    \"\"\"\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    else:\n",
    "      # Without this branch `src` would be unbound in the call below\n",
    "      raise ValueError(f'Unsupported video source: {source!r}')\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  \"\"\"\n",
    "  Build one output widget per video so that they can be shown as tabs.\n",
    "\n",
    "  Args:\n",
    "    video_ids: List\n",
    "      (source, id) pairs; source is 'Youtube', 'Bilibili' or 'Osf'\n",
    "    W, H: Integer\n",
    "      Width and height of the embedded player\n",
    "    fs: Integer\n",
    "      Forwarded to the player as its `fs` argument\n",
    "\n",
    "  Returns:\n",
    "    tab_contents: List\n",
    "      One `widgets.Output` per entry of `video_ids`, in the same order\n",
    "  \"\"\"\n",
    "  tab_contents = []\n",
    "  for entry in video_ids:\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      source, vid = entry[0], entry[1]\n",
    "      if source == 'Youtube':\n",
    "        video = YouTubeVideo(id=vid, width=W, height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=vid, source=source, width=W, height=H,\n",
    "                          fs=fs, autoplay=False)\n",
    "        if source == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif source == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'zm9oekdkJbQ'), ('Bilibili', 'BV1VB4y1K7Vr')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Introduction_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Discuss: Unexpected consequences\n",
    "\n",
    "Can you think of examples from your own experience/life where poorly chosen incentives or objectives have led to unexpected consequences?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_1ecffd5a.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Unexpected_consequences_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 2: Case study: successfully training an MLP for image classification\n",
    "\n",
    "*Time estimate: ~40 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Many of the core ideas (and tricks) in modern optimization for deep learning can be illustrated in the simple setting of training an MLP to solve an image classification task. In this tutorial we will guide you through the key challenges that arise when optimizing high-dimensional, non-convex$^\\dagger$ problems. We will use these challenges to motivate and explain some commonly used solutions.\n",
    "\n",
    "**Disclaimer:** Some of the functions you will code in this tutorial are already implemented in Pytorch and many other libraries. For pedagogical reasons, we decided to bring these simple coding tasks into the spotlight and place a relatively higher emphasis on your understanding of the algorithms, rather than the use of a specific library.\n",
    "\n",
    "In 'day-to-day' research projects you will likely rely on the community-vetted, optimized libraries rather than the 'manual implementations' you will write today. In Section 8 you will have a chance to 'put it all together' and use the full power of Pytorch to tune the parameters of an MLP to classify handwritten digits."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "$^\\dagger$: A **strictly convex** function has at most one minimum, so any local minimum is also the global minimum - a nice property for optimization, as an algorithm cannot get stuck in a local minimum that isn't the global one (e.g., $f(x)=x^2 + 2x + 1$). A **non-convex** function is wavy - it has some 'valleys' (local minima) that aren't as deep as the overall deepest 'valley' (global minimum). Thus, optimization algorithms can get stuck in a local minimum, and it can be hard to tell when this happens (e.g., $f(x) = x^4 + x^3 - 2x^2 - 2x$). See also **Section 5** for more details."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 2: Case Study - MLP Classification\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 2: Case Study - MLP Classification\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  \"\"\"\n",
    "  IFrame that embeds a video hosted on Bilibili or OSF.\n",
    "  \"\"\"\n",
    "\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "      id: String\n",
    "        Video identifier on the hosting platform\n",
    "      source: String\n",
    "        Hosting platform, either 'Bilibili' or 'Osf'\n",
    "      page: Integer\n",
    "        Page index inserted in the Bilibili player URL\n",
    "      width, height: Integer\n",
    "        Size of the embedded frame\n",
    "      **kwargs:\n",
    "        Forwarded to `IFrame`\n",
    "\n",
    "    Raises:\n",
    "      ValueError: if `source` is not one of the supported platforms\n",
    "    \"\"\"\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    else:\n",
    "      # Without this branch `src` would be unbound in the call below\n",
    "      raise ValueError(f'Unsupported video source: {source!r}')\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  \"\"\"\n",
    "  Build one output widget per video so that they can be shown as tabs.\n",
    "\n",
    "  Args:\n",
    "    video_ids: List\n",
    "      (source, id) pairs; source is 'Youtube', 'Bilibili' or 'Osf'\n",
    "    W, H: Integer\n",
    "      Width and height of the embedded player\n",
    "    fs: Integer\n",
    "      Forwarded to the player as its `fs` argument\n",
    "\n",
    "  Returns:\n",
    "    tab_contents: List\n",
    "      One `widgets.Output` per entry of `video_ids`, in the same order\n",
    "  \"\"\"\n",
    "  tab_contents = []\n",
    "  for entry in video_ids:\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      source, vid = entry[0], entry[1]\n",
    "      if source == 'Youtube':\n",
    "        video = YouTubeVideo(id=vid, width=W, height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=vid, source=source, width=W, height=H,\n",
    "                          fs=fs, autoplay=False)\n",
    "        if source == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif source == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'pJc2ENhYbqA'), ('Bilibili', 'BV1GB4y1K7Ha')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Case_study_MLP_classification_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Section 2.1: Data\n",
    "\n",
    "We will use the MNIST dataset of handwritten digits. We load the data via the Pytorch `datasets` module, as you learned in W1D1.\n",
    "\n",
    "**Note:** Although we can download the MNIST dataset directly from `datasets` using the optional argument `download=True`, we are going to download it from the NMA directory on OSF to ensure network reliability.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Download MNIST dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Download MNIST dataset\n",
    "import tarfile, requests, os\n",
    "\n",
    "fname = 'MNIST.tar.gz'\n",
    "name = 'MNIST'\n",
    "url = 'https://osf.io/y2fj6/download'\n",
    "\n",
    "# The extracted `MNIST` folder doubles as the marker that the data is present\n",
    "if os.path.exists(name):\n",
    "  print('MNIST dataset has been downloaded.')\n",
    "else:\n",
    "  print('\\nDownloading MNIST dataset...')\n",
    "  r = requests.get(url, allow_redirects=True)\n",
    "  # Fail loudly on an HTTP error instead of saving an error page as the archive\n",
    "  r.raise_for_status()\n",
    "  with open(fname, 'wb') as fh:\n",
    "    fh.write(r.content)\n",
    "  print('\\nDownloading MNIST completed.')\n",
    "\n",
    "  with tarfile.open(fname) as tar:\n",
    "    tar.extractall()\n",
    "  # Delete the archive only after the tarfile handle has been closed\n",
    "  os.remove(fname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def load_mnist_data(change_tensors=False, download=False):\n",
    "  \"\"\"\n",
    "  Load the training and test splits of the MNIST handwritten digits dataset\n",
    "  (greyscale images) and normalize them with the mean and standard deviation\n",
    "  of the *training* pixels.\n",
    "\n",
    "  Args:\n",
    "    change_tensors: Bool\n",
    "      If True, the normalization is applied directly to the `.data` tensors\n",
    "      of both datasets. If False, both datasets are re-created with a\n",
    "      `Normalize` transform instead and `.data` is left untouched\n",
    "    download: Bool\n",
    "      Forwarded to `datasets.MNIST` as its `download` argument\n",
    "\n",
    "  Returns:\n",
    "    train_set: datasets.MNIST\n",
    "      training split; images are stored in `train_set.data` and the\n",
    "      0-9 integer labels in `train_set.targets`\n",
    "    test_set: datasets.MNIST\n",
    "      test split; images are stored in `test_set.data` and the\n",
    "      0-9 integer labels in `test_set.targets`\n",
    "  \"\"\"\n",
    "  # Load train and test sets\n",
    "  train_set = datasets.MNIST(root='.', train=True, download=download,\n",
    "                             transform=torchvision.transforms.ToTensor())\n",
    "  test_set = datasets.MNIST(root='.', train=False, download=download,\n",
    "                            transform=torchvision.transforms.ToTensor())\n",
    "\n",
    "  # Original data is in range [0, 255]. We normalize the data wrt its mean and std_dev.\n",
    "  # Note that we only used *training set* information to compute mean and std\n",
    "  mean = train_set.data.float().mean()\n",
    "  std = train_set.data.float().std()\n",
    "\n",
    "  if change_tensors:\n",
    "    # Apply normalization directly to the tensors containing the dataset\n",
    "    train_set.data = (train_set.data.float() - mean) / std\n",
    "    test_set.data = (test_set.data.float() - mean) / std\n",
    "  else:\n",
    "    # The statistics are rescaled by 255 because `Normalize` is applied after `ToTensor`\n",
    "    tform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),\n",
    "                                            torchvision.transforms.Normalize(mean=[mean / 255.], std=[std / 255.])\n",
    "                                            ])\n",
    "    train_set = datasets.MNIST(root='.', train=True, download=download,\n",
    "                               transform=tform)\n",
    "    test_set = datasets.MNIST(root='.', train=False, download=download,\n",
    "                              transform=tform)\n",
    "\n",
    "  return train_set, test_set\n",
    "\n",
    "\n",
    "train_set, test_set = load_mnist_data(change_tensors=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "As we are just getting started, we will concentrate on a small subset of only 500 examples out of the 60,000 data points contained in the whole training set.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Sample a random subset of 500 indices\n",
    "subset_index = np.random.choice(len(train_set.data), 500)\n",
    "\n",
    "# We will use these symbols to represent the training data and labels, to stay\n",
    "# as close to the mathematical expressions as possible.\n",
    "X, y = train_set.data[subset_index, :], train_set.targets[subset_index]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Run the following cell to visualize the content of three examples in our training set. Note how the preprocessing we applied to the data changes the range of pixel values after normalization.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Run me!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Run me!\n",
    "\n",
    "# Exploratory data analysis and visualisation\n",
    "\n",
    "num_figures = 3\n",
    "fig, axs = plt.subplots(1, num_figures, figsize=(5 * num_figures, 5))\n",
    "\n",
    "for sample_id, ax in enumerate(axs):\n",
    "  # Plot the pixel values for each image\n",
    "  ax.matshow(X[sample_id, :], cmap='gray_r')\n",
    "  # 'Write' the pixel value in the corresponding location\n",
    "  for (i, j), z in np.ndenumerate(X[sample_id, :]):\n",
    "    text = '{:.1f}'.format(z)\n",
    "    ax.text(j, i, text, ha='center',\n",
    "            va='center', fontsize=6, c='steelblue')\n",
    "\n",
    "  ax.set_title('Label: ' + str(y[sample_id].item()))\n",
    "  ax.axis('off')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Section 2.2: Model\n",
    "\n",
    "As you will see next week, there are specific model architectures that are better suited to image-like data, such as Convolutional Neural Networks (CNNs). For simplicity, in this tutorial we will focus exclusively on Multi-Layer Perceptron (MLP) models as they allow us to highlight many important optimization challenges shared with more advanced neural network designs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "class MLP(nn.Module):\n",
    "  \"\"\"\n",
    "  This class implements MLPs in Pytorch of an arbitrary number of hidden\n",
    "  layers of potentially different sizes. Since we concentrate on classification\n",
    "  tasks in this tutorial, we have a log_softmax layer at prediction time.\n",
    "  \"\"\"\n",
    "\n",
    "  def __init__(self, in_dim=784, out_dim=10, hidden_dims=[], use_bias=True):\n",
    "    \"\"\"\n",
    "    Constructs a MultiLayerPerceptron\n",
    "\n",
    "    Args:\n",
    "      in_dim: Integer\n",
    "        dimensionality of input data (784)\n",
    "      out_dim: Integer\n",
    "        number of classes (10)\n",
    "      hidden_dims: List\n",
    "        containing the dimensions of the hidden layers,\n",
    "        empty list corresponds to a linear model (in_dim, out_dim)\n",
    "      use_bias: Bool\n",
    "        whether every linear layer has a bias term\n",
    "\n",
    "    Returns:\n",
    "      Nothing\n",
    "    \"\"\"\n",
    "\n",
    "    super(MLP, self).__init__()\n",
    "\n",
    "    self.in_dim = in_dim\n",
    "    self.out_dim = out_dim\n",
    "\n",
    "    # Widths at every layer boundary: in_dim - hidden_dims... - out_dim.\n",
    "    # `hidden_dims` is copied, so the shared default list is never aliased.\n",
    "    dims = [in_dim] + list(hidden_dims) + [out_dim]\n",
    "\n",
    "    # One linear layer per consecutive pair of widths, with a ReLU between\n",
    "    # layers but not after the output layer. With no hidden layer this is a\n",
    "    # single linear map (e.g. in logistic regression).\n",
    "    layers = []\n",
    "    for i, (fan_in, fan_out) in enumerate(zip(dims[:-1], dims[1:])):\n",
    "      if i > 0:\n",
    "        layers.append(nn.ReLU())\n",
    "      layers.append(nn.Linear(fan_in, fan_out, bias=use_bias))\n",
    "\n",
    "    self.main = nn.Sequential(*layers)\n",
    "\n",
    "  def forward(self, x):\n",
    "    \"\"\"\n",
    "    Defines the network structure and flow from input to output\n",
    "\n",
    "    Args:\n",
    "      x: Tensor\n",
    "        Batch of images; it is reshaped to (-1, in_dim) before the first layer\n",
    "\n",
    "    Returns:\n",
    "      output: Tensor\n",
    "        log-probabilities over the classes, of shape (batch_size, out_dim)\n",
    "    \"\"\"\n",
    "    # Flatten each image into a 'vector'\n",
    "    flat_x = x.view(-1, self.in_dim)\n",
    "    logits = self.main(flat_x)\n",
    "    return F.log_softmax(logits, dim=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Linear models constitute a very special kind of MLPs: they are equivalent to an MLP with *zero* hidden layers. This is simply an affine transformation, in other words a 'linear' map $W x$ with an 'offset' $b$; followed by a softmax function.\n",
    "\n",
    "$$f(x) = \\text{softmax}(W x + b)$$\n",
    "\n",
    "Here $x \\in \\mathbb{R}^{784}$, $W \\in \\mathbb{R}^{10 \\times 784}$ and $b \\in \\mathbb{R}^{10}$. Notice that the dimensions of the weight matrix are $10 \\times 784$ as the input tensors are flattened images, i.e., $28 \\times 28 = 784$-dimensional tensors, and the output layer consists of $10$ nodes. Also, note that PyTorch's implementation of the affine map (`nn.Linear`) operates on the rows of the input instead of the columns. That is, the $i$-th row of the output is the mapping of the $i$-th row of the input under $W$, plus the bias term $b$. See the section on affine maps here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#affine-maps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Empty hidden_dims means we take a model with zero hidden layers.\n",
    "model = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "\n",
    "# We print the model structure with 784 inputs and 10 outputs\n",
    "print(model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Section 2.3: Loss\n",
    "\n",
    "While we care about the accuracy of the model, the 'discrete' nature of the 0-1 loss makes it challenging to optimize. In order to learn good parameters for this model, we will use the cross entropy loss (negative log-likelihood), which you saw in the last lecture, as a surrogate objective to be minimized.\n",
    "\n",
    "This particular choice of model and optimization objective leads to a *convex* optimization problem with respect to the parameters $W$ and $b$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "loss_fn = F.nll_loss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Section 2.4: Interpretability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "In the last lecture, you saw that inspecting the weights of a model can provide insights on what 'concepts' the model has learned. Here we show the weights of a partially trained model. The weights corresponding to each class 'learn' to _fire_ when an input of the class is detected.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "#@markdown Run _this cell_ to train the model. If you are curious about how the training\n",
    "#@markdown takes place, double-click this cell to find out. At the end of this tutorial\n",
    "#@markdown you will have the opportunity to train a more complex model on your own.\n",
    "\n",
    "cell_verbose = False\n",
    "partial_trained_model = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "\n",
    "if cell_verbose:\n",
    "  print('Init loss', loss_fn(partial_trained_model(X), y).item()) # This matches around np.log(10 = # of classes)\n",
    "\n",
    "# Invoke an optimizer using Adaptive gradient and Momentum (more about this in Section 7)\n",
    "optimizer = optim.Adam(partial_trained_model.parameters(), lr=7e-4)\n",
    "for _ in range(200):\n",
    "  loss = loss_fn(partial_trained_model(X), y)\n",
    "  optimizer.zero_grad()\n",
    "  loss.backward()\n",
    "  optimizer.step()\n",
    "\n",
    "if cell_verbose:\n",
    "  print('End loss', loss_fn(partial_trained_model(X), y).item()) # This should be less than 1e-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Show class filters of a trained model\n",
    "W = partial_trained_model.main[0].weight.data.numpy()\n",
    "\n",
    "fig, axs = plt.subplots(1, 10, figsize=(15, 4))\n",
    "for class_id in range(10):\n",
    "  axs[class_id].imshow(W[class_id, :].reshape(28, 28), cmap='gray_r')\n",
    "  axs[class_id].axis('off')\n",
    "  axs[class_id].set_title('Class ' + str(class_id) )\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 3: High dimensional search\n",
    "\n",
    "*Time estimate: ~25 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "We now have a model with its corresponding trainable parameters as well as an objective to optimize. Where do we go next? How do we find a 'good' configuration of parameters?\n",
    "\n",
    "One idea is to choose a random direction and move only if the objective is reduced. However, this is inefficient in high dimensions and you will see how gradient descent (with a suitable step-size) can guarantee consistent improvement in terms of the objective function."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 3: Optimization of an Objective Function\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 3: Optimization of an Objective Function\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'aSJTRdjRvvw'), ('Bilibili', 'BV1aL411H7Ce')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Optimization_of_an_Objective_Function_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Coding Exercise 3: Implement gradient descent\n",
    "\n",
    "In this exercise you will use PyTorch automatic differentiation capabilities to compute the gradient of the loss with respect to the parameters of the model. You will then use these gradients to implement the update performed by the gradient descent method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def zero_grad(params):\n",
    "  \"\"\"\n",
    "  Clear gradients as they accumulate on successive backward calls\n",
    "\n",
    "  Args:\n",
    "    params: an iterable of tensors (e.g., a list)\n",
    "      i.e., the weights and biases whose gradients are reset\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  for par in params:\n",
    "    if not(par.grad is None):\n",
    "      par.grad.data.zero_()\n",
    "\n",
    "\n",
    "def random_update(model, noise_scale=0.1, normalized=False):\n",
    "  \"\"\"\n",
    "  Performs a random update on the parameters of the model to help\n",
    "  understand the effectiveness of updating random directions\n",
    "  for the problem of optimizing the parameters of a high-dimensional linear model.\n",
    "\n",
    "  Args:\n",
    "    model: nn.Module derived class\n",
    "      The model whose parameters are to be updated\n",
    "\n",
    "    noise_scale: float\n",
    "      Factor multiplying the Gaussian noise added to each parameter\n",
    "\n",
    "    normalized: Bool\n",
    "      If True, the noise for each parameter is rescaled to unit norm before being applied\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  for par in model.parameters():\n",
    "    noise = torch.randn_like(par)\n",
    "    if normalized:\n",
    "      noise /= torch.norm(noise)\n",
    "    par.data +=  noise_scale * noise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Let's implement the gradient descent!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def gradient_update(loss, params, lr=1e-3):\n",
    "  \"\"\"\n",
    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
    "\n",
    "  Args:\n",
    "    loss: Tensor\n",
    "      A scalar tensor containing the loss through which the gradient will be computed\n",
    "    params: List of Tensors\n",
    "      Collection of parameters with respect to which we compute gradients\n",
    "    lr: Float\n",
    "      Scalar specifying the learning rate or step-size for the update\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
    "  # successive backward calls\n",
    "  zero_grad(params)\n",
    "\n",
    "  # Compute gradients on given objective\n",
    "  loss.backward()\n",
    "\n",
    "  with torch.no_grad():\n",
    "    for par in params:\n",
    "      #################################################\n",
    "      ## TODO for students: update the value of the parameter ##\n",
    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
    "      #################################################\n",
    "      # Here we work with the 'data' attribute of the parameter rather than the\n",
    "      # parameter itself.\n",
    "      # Hence - use the learning rate and the parameter's .grad.data attribute to perform an update\n",
    "      par.data -= ...\n",
    "\n",
    "\n",
    "set_seed(seed=SEED)\n",
    "model1 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "print('\\n The model1 parameters before the update are: \\n')\n",
    "print_params(model1)\n",
    "loss = loss_fn(model1(X), y)\n",
    "\n",
    "## Uncomment below to test your function\n",
    "# gradient_update(loss, list(model1.parameters()), lr=1e-1)\n",
    "# print('\\n The model1 parameters after the update are: \\n')\n",
    "# print_params(model1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "```\n",
    " The model1 parameters after the update are:\n",
    "\n",
    "main.0.weight tensor([[-0.0263,  0.0010,  0.0174,  ...,  0.0298,  0.0278, -0.0220],\n",
    "        [-0.0047, -0.0302, -0.0093,  ..., -0.0077,  0.0248, -0.0240],\n",
    "        [ 0.0234, -0.0237,  0.0335,  ...,  0.0117,  0.0263, -0.0187],\n",
    "        ...,\n",
    "        [-0.0006,  0.0156,  0.0110,  ...,  0.0143, -0.0302, -0.0145],\n",
    "        [ 0.0164,  0.0286,  0.0238,  ..., -0.0127, -0.0191,  0.0188],\n",
    "        [ 0.0206, -0.0354, -0.0184,  ..., -0.0272,  0.0098,  0.0002]])\n",
    "main.0.bias tensor([-0.0292, -0.0018,  0.0115, -0.0370,  0.0054,  0.0155,  0.0317,  0.0246,\n",
    "         0.0198, -0.0061])\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_e46fc6a9.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Implement_Gradient_descent_Exercise\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Comparing updates\n",
    "\n",
    "These plots compare the effectiveness of updating random directions for the problem of optimizing the parameters of a high-dimensional linear model. We contrast the behavior at initialization and during an intermediate stage of training by showing the histograms of the change in loss over 100 different random directions vs. the change in loss induced by the gradient descent update.\n",
    "\n",
    "**Remember:** Since we are trying to minimize here, the more negative the better!\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " _Run this cell_ to visualize the results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown _Run this cell_ to visualize the results\n",
    "fig, axs = plt.subplots(1, 2, figsize=(10, 4))\n",
    "\n",
    "for id, (model_name, my_model) in enumerate([('Initialization', model),\n",
    "                                              ('Partially trained', partial_trained_model)]):\n",
    "  # Compute the loss we will be comparing to\n",
    "  base_loss = loss_fn(my_model(X), y)\n",
    "\n",
    "  # Compute the improvement via gradient descent\n",
    "  dummy_model = copy.deepcopy(my_model)\n",
    "  loss1 = loss_fn(dummy_model(X), y)\n",
    "  gradient_update(loss1, list(dummy_model.parameters()), lr=1e-2)\n",
    "  gd_delta = loss_fn(dummy_model(X), y) - base_loss\n",
    "\n",
    "  deltas = []\n",
    "  for trial_id in range(100):\n",
    "    # Compute the improvement obtained with a random direction\n",
    "    dummy_model = copy.deepcopy(my_model)\n",
    "    random_update(dummy_model, noise_scale=1e-2)\n",
    "    deltas.append((loss_fn(dummy_model(X), y) - base_loss).item())\n",
    "\n",
    "  # Plot histogram for random direction and vertical line for gradient descent\n",
    "  axs[id].hist(deltas, label='Random Directions', bins=20)\n",
    "  axs[id].set_title(model_name)\n",
    "  axs[id].set_xlabel('Change in loss')\n",
    "  axs[id].set_ylabel('% samples')\n",
    "  axs[id].axvline(0, c='green', alpha=0.5)\n",
    "  axs[id].axvline(gd_delta.item(), linestyle='--', c='red', alpha=1,\n",
    "                  label='Gradient Descent')\n",
    "\n",
    "\n",
    "handles, labels = axs[id].get_legend_handles_labels()\n",
    "fig.legend(handles, labels, loc='upper center',\n",
    "           bbox_to_anchor=(0.5, 1.05),\n",
    "           fancybox=False, shadow=False, ncol=2)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Think! 3: Gradient descent vs. random search\n",
    "\n",
    "Compare the behavior of gradient descent and random search based on the histograms above. Is either of the two methods more reliable? How can you explain the difference in the behavior of the methods at initialization vs. during training?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_c2013acf.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Gradient_descent_vs_random_search_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 4: Poor conditioning\n",
    "\n",
    "*Time estimate: ~30 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Already in this 'simple' logistic regression problem, the issue of bad conditioning is haunting us. Not all parameters are created equal, and the sensitivity of the network to changes in the parameters will have a big impact on the dynamics of the optimization.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 4: Momentum\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 4: Momentum\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', '3ES5O58Y_2M'), ('Bilibili', 'BV1NL411H71t')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Momentum_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "We illustrate this issue in a 2-dimensional setting. We freeze all but two parameters of the network: one of them is an element of the weight matrix (filter) for class 0, while the other is the bias for class 7. This results in an optimization problem with two decision variables.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "### Think! 4: How does momentum work?\n",
    "\n",
    "How much difference is there in the behavior of these two parameters under gradient descent? What is the effect of momentum in bridging that gap?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# to remove solution\n",
    "\"\"\"\n",
    "The landscapes of the two parameters appear to be\n",
    "flatter under gradient descent as can be seen in interactive demo 4 below.\n",
    "\n",
    "As randomly-initialised models exhibit chaos, we use the Newton's approach\n",
    "by tweaking the learning rate i.e., taking smaller steps in the indicated\n",
    "direction and recomputing gradients to find an optimal solution on a\n",
    "varied surface. Momentum helps reduce the chaos by maintaining a consistent\n",
    "direction for exploration (linear combination of the previous heading vector,\n",
    "and the newly-computed gradient vector).\n",
    "\"\"\";"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_How_Momentum_works_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Coding Exercise 4: Implement momentum\n",
    "\n",
    "In this exercise you will implement the momentum update given by:\n",
    "\n",
    "\\begin{equation}\n",
    "w_{t+1} = w_t - \\eta \\nabla J(w_t) + \\beta (w_t - w_{t-1})\n",
    "\\end{equation}\n",
    "\n",
    "It is convenient to re-express this update rule in terms of a recursion. For that, we define 'velocity' as the quantity:\n",
    "\\begin{equation}\n",
    "v_{t-1} := w_{t} - w_{t-1}\n",
    "\\end{equation}\n",
    "\n",
    "which leads to the two-step update rule:\n",
    "\n",
    "\\begin{equation}\n",
    "v_t = - \\eta \\nabla J(w_t) + \\beta (\\underbrace{w_t - w_{t-1}}_{v_{t-1}})\n",
    "\\end{equation}\n",
    "\n",
    "\\begin{equation}\n",
    "w_{t+1} \\leftarrow w_t + v_{t}\n",
    "\\end{equation}\n",
    "\n",
    "Pay attention to the positive sign of the update in the last equation, given the definition of $v_t$, above."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Run this cell to setup some helper functions!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Run this cell to setup some helper functions!\n",
    "\n",
    "def loss_2d(model, u, v, mask_idx=(0, 378), bias_id=7):\n",
    "  \"\"\"\n",
    "  Defines a 2-dim function by freezing all\n",
    "  but two parameters of a linear model.\n",
    "\n",
    "  Args:\n",
    "    model: nn.Module\n",
    "      a pytorch linear model\n",
    "    u: Scalar\n",
    "      first free parameter\n",
    "    v: Scalar\n",
    "      second free parameter\n",
    "    mask_idx: Tuple\n",
    "      selects parameter in weight matrix replaced by u\n",
    "    bias_id: Integer\n",
    "      selects parameter in bias vector replaced by v\n",
    "      (NOTE: the body currently hard-codes index 7, which matches the default)\n",
    "\n",
    "  Returns:\n",
    "    loss: Scalar\n",
    "      loss of the 'new' model\n",
    "      over inputs X, y (defined externally)\n",
    "  \"\"\"\n",
    "\n",
    "  # We zero out the element of the weight tensor that will be\n",
    "  # replaced by u\n",
    "  mask = torch.ones_like(model.main[0].weight)\n",
    "  mask[mask_idx[0], mask_idx[1]] = 0.\n",
    "  masked_weights = model.main[0].weight * mask\n",
    "\n",
    "  # u is replacing an element of the weight matrix\n",
    "  masked_weights[mask_idx[0], mask_idx[1]] = u\n",
    "\n",
    "  res = X.reshape(-1, 784) @ masked_weights.T + model.main[0].bias\n",
    "\n",
    "  # v is replacing a bias for class 7\n",
    "  res[:, 7] += v - model.main[0].bias[7]\n",
    "  res =  F.log_softmax(res, dim=1)\n",
    "\n",
    "  return loss_fn(res, y)\n",
    "\n",
    "\n",
    "def plot_surface(U, V, Z, fig):\n",
    "  \"\"\"\n",
    "  Plot a 3D loss surface given\n",
    "  meshed inputs U, V and values Z\n",
    "\n",
    "  Args:\n",
    "    U: np.ndarray\n",
    "      Meshed x-coordinates of the surface (axis labelled 'Weight')\n",
    "    V: np.ndarray\n",
    "      Meshed y-coordinates of the surface (axis labelled 'Bias')\n",
    "    Z: np.ndarray\n",
    "      Surface heights at each (U, V) point (axis labelled 'Loss')\n",
    "    fig: matplotlib.figure.Figure instance\n",
    "      Figure to which the 3D subplot is added (right panel of a 1x2 grid)\n",
    "\n",
    "  Returns:\n",
    "    ax: matplotlib.axes._subplots.AxesSubplot instance\n",
    "      Plotted subplot data\n",
    "  \"\"\"\n",
    "  ax = fig.add_subplot(1, 2, 2, projection='3d')\n",
    "  ax.view_init(45, -130)\n",
    "\n",
    "  surf = ax.plot_surface(U, V, Z, cmap=plt.cm.coolwarm,\n",
    "                      linewidth=0, antialiased=True, alpha=0.5)\n",
    "\n",
    "  # Select certain level contours to plot\n",
    "  # levels = Z.min() * np.array([1.005, 1.1, 1.3, 1.5, 2.])\n",
    "  # plt.contour(U, V, Z)# levels=levels, alpha=0.5)\n",
    "\n",
    "  ax.set_xlabel('Weight')\n",
    "  ax.set_ylabel('Bias')\n",
    "  ax.set_zlabel('Loss', rotation=90)\n",
    "\n",
    "  return ax\n",
    "\n",
    "\n",
    "def plot_param_distance(best_u, best_v, trajs, fig, styles, labels,\n",
    "                        use_log=False, y_min_v=-12.0, y_max_v=1.5):\n",
    "  \"\"\"\n",
    "  Plot the distance to each of the\n",
    "  two parameters for a collection of 'trajectories'\n",
    "\n",
    "  Args:\n",
    "    best_u: float\n",
    "      Optimal value of the first parameter (u); distances are measured to it\n",
    "    best_v: float\n",
    "      Optimal value of the second parameter (v); distances are measured to it\n",
    "    trajs: Iterable\n",
    "      Trajectories to plot; each is a sequence of steps whose first two\n",
    "      entries are the values of u and v at that step\n",
    "    fig: matplotlib.figure.Figure instance\n",
    "      Figure to which the subplot is added\n",
    "    styles: Iterable\n",
    "      Matplotlib format strings, one per trajectory\n",
    "    labels: Iterable\n",
    "      Legend labels (strings), one per trajectory\n",
    "    use_log: Bool\n",
    "      If True, plot the log of the distance; else, the absolute distance\n",
    "    y_min_v: float\n",
    "      Lower limit of the y-axis (only applied when use_log is True)\n",
    "    y_max_v: float\n",
    "      Upper limit of the y-axis (only applied when use_log is True)\n",
    "\n",
    "  Returns:\n",
    "    ax: matplotlib.axes._subplots.AxesSubplot instance\n",
    "      Plotted subplot data\n",
    "  \"\"\"\n",
    "  ax = fig.add_subplot(1, 1, 1)\n",
    "\n",
    "  for traj, style, label in zip(trajs, styles, labels):\n",
    "    d0 = np.array([np.abs(_[0] - best_u) for _ in traj])\n",
    "    d1 = np.array([np.abs(_[1] - best_v) for _ in traj])\n",
    "    if use_log:\n",
    "      d0 = np.log(1e-16 + d0)\n",
    "      d1 = np.log(1e-16 + d1)\n",
    "    ax.plot(range(len(traj)), d0, style, label='weight - ' + label)\n",
    "    ax.plot(range(len(traj)), d1, style, label='bias - ' + label)\n",
    "  ax.set_xlabel('Iteration')\n",
    "  if use_log:\n",
    "    ax.set_ylabel('Log distance to optimum (per dimension)')\n",
    "    ax.set_ylim(y_min_v, y_max_v)\n",
    "  else:\n",
    "    ax.set_ylabel('Abs distance to optimum (per dimension)')\n",
    "  ax.legend(loc='right', bbox_to_anchor=(1.5, 0.5),\n",
    "            fancybox=False, shadow=False, ncol=1)\n",
    "\n",
    "  return ax\n",
    "\n",
    "\n",
    "def run_optimizer(inits, eval_fn, update_fn, max_steps=500,\n",
    "                  optim_kwargs={'lr':1e-2}, log_traj=True):\n",
    "  \"\"\"\n",
    "  Runs an optimizer on a given\n",
    "  objective and logs parameter trajectory\n",
    "\n",
    "  Args:\n",
    "      inits: List of scalars\n",
    "        initial values of the parameters\n",
    "      eval_fn: Callable\n",
    "        function computing the objective to be minimized\n",
    "      update_fn: Callable\n",
    "        function executing parameter update\n",
    "      max_steps: Integer\n",
    "        number of iterations to run\n",
    "      optim_kwargs: Dictionary\n",
    "        customizable dictionary containing appropriate hyperparameters for the chosen optimizer\n",
    "      log_traj: Bool\n",
    "        If True, record the parameter values and loss at every step and return them\n",
    "\n",
    "  Returns:\n",
    "      traj: np.ndarray of shape (max_steps, len(inits) + 1)\n",
    "        trajectory information [*params, loss] for each optimization step;\n",
    "        nothing is returned when log_traj is False\n",
    "  \"\"\"\n",
    "\n",
    "  # Initialize parameters and optimizer\n",
    "  params = [nn.Parameter(torch.tensor(_)) for _ in inits]\n",
    "  # Methods like momentum and rmsprop keep an auxiliary vector of parameters\n",
    "  aux_tensors = [torch.zeros_like(_) for _ in params]\n",
    "  if log_traj:\n",
    "    traj = np.zeros((max_steps, len(params)+1))\n",
    "  for _ in range(max_steps):\n",
    "    # Evaluate loss\n",
    "    loss = eval_fn(*params)\n",
    "    # Store 'trajectory' information\n",
    "    if log_traj:\n",
    "      traj[_, :] = [_.item() for _ in params] + [loss.item()]\n",
    "    # Perform update\n",
    "    if update_fn == gradient_update:\n",
    "      gradient_update(loss, params, **optim_kwargs)\n",
    "    else:\n",
    "      update_fn(loss, params, aux_tensors, **optim_kwargs)\n",
    "  if log_traj:\n",
    "    return traj\n",
    "\n",
    "\n",
    "L = 4.\n",
    "xs = np.linspace(-L, L, 30)\n",
    "ys = np.linspace(-L, L, 30)\n",
    "U, V = np.meshgrid(xs, ys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def momentum_update(loss, params, grad_vel, lr=1e-3, beta=0.8):\n",
    "  \"\"\"\n",
    "  Perform a momentum update over a collection of parameters given a loss and velocities\n",
    "\n",
    "  Args:\n",
    "    loss: Tensor\n",
    "      A scalar tensor containing the loss through which gradient will be computed\n",
    "    params: Iterable\n",
    "      Collection of parameters with respect to which we compute gradients\n",
    "    grad_vel: Iterable\n",
    "      Collection containing the 'velocity' v_t for each parameter\n",
    "    lr: Float\n",
    "      Scalar specifying the learning rate or step-size for the update\n",
    "    beta: Float\n",
    "      Scalar 'momentum' parameter\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
    "  # successive backward calls\n",
    "  zero_grad(params)\n",
    "  # Compute gradients on given objective\n",
    "  loss.backward()\n",
    "\n",
    "  with torch.no_grad():\n",
    "    for (par, vel) in zip(params, grad_vel):\n",
    "      #################################################\n",
    "      ## TODO for students: update the value of the parameter ##\n",
    "      raise NotImplementedError(\"Student exercise: implement momentum update\")\n",
    "      #################################################\n",
    "      # Update 'velocity'\n",
    "      vel.data = ...\n",
    "      # Update parameters\n",
    "      par.data += ...\n",
    "\n",
    "\n",
    "set_seed(seed=SEED)\n",
    "model2 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "print('\\n The model2 parameters before the update are: \\n')\n",
    "print_params(model2)\n",
    "loss = loss_fn(model2(X), y)\n",
    "initial_vel = [torch.randn_like(p) for p in model2.parameters()]\n",
    "\n",
    "## Uncomment below to test your function\n",
    "# momentum_update(loss, list(model2.parameters()), grad_vel=initial_vel, lr=1e-1, beta=0.9)\n",
    "# print('\\n The model2 parameters after the update are: \\n')\n",
    "# print_params(model2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "```\n",
    " The model2 parameters after the update are:\n",
    "\n",
    "main.0.weight tensor([[ 1.5898,  0.0116, -2.0239,  ..., -1.0871,  0.4030, -0.9577],\n",
    "        [ 0.4653,  0.6022, -0.7363,  ...,  0.5485, -0.2747, -0.6539],\n",
    "        [-1.4117, -1.1045,  0.6492,  ..., -1.0201,  0.6503,  0.1310],\n",
    "        ...,\n",
    "        [-0.5098,  0.5075, -0.0718,  ...,  1.1192,  0.2900, -0.9657],\n",
    "        [-0.4405, -0.1174,  0.7542,  ...,  0.0792, -0.1857,  0.3537],\n",
    "        [-1.0824,  1.0080, -0.4254,  ..., -0.3760, -1.7491,  0.6025]])\n",
    "main.0.bias tensor([ 0.4147, -1.0440,  0.8720, -1.6201, -0.9632,  0.9430, -0.5180,  1.3417,\n",
    "         0.6574,  0.3677])\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_3fe0e5cf.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Implement_momentum_Exercise\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Interactive Demo 4: Momentum vs. GD\n",
    "\n",
    "The plots below show the distance to the optimum for both variables across the two methods, as well as the parameter trajectory over the loss surface.\n",
    "\n",
    "Tune the learning rate and momentum parameters to achieve a loss below $10^{-6}$ (for both dimensions) within 100 iterations."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Run this cell to enable the widget!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown Run this cell to enable the widget!\n",
    "from matplotlib.lines import Line2D\n",
    "\n",
    "def run_newton(func, init_list=[0., 0.], max_iter=200):\n",
    "  \"\"\"\n",
    "  Find the optimum of this 2D problem using Newton's method\n",
    "\n",
    "  Args:\n",
    "    func: Callable\n",
    "      Initialising parameter tensor updates\n",
    "    init_list: Scalar\n",
    "      initialization of parameters\n",
    "    max_iter: Integer\n",
    "      The maximum number of iterations to complete\n",
    "\n",
    "  Returns:\n",
    "    par_tensor.data.numpy(): ndarray\n",
    "      List of newton's updates\n",
    "  \"\"\"\n",
    "\n",
    "  par_tensor = torch.tensor(init_list, requires_grad=True)\n",
    "  t_g = lambda par_tensor: func(par_tensor[0], par_tensor[1])\n",
    "\n",
    "  for _ in tqdm(range(max_iter)):\n",
    "    eval_loss = t_g(par_tensor)\n",
    "    eval_grad = torch.autograd.grad(eval_loss, [par_tensor])[0]\n",
    "    eval_hess = torch.autograd.functional.hessian(t_g, par_tensor)\n",
    "    # Newton's update is:  - inverse(Hessian) x gradient\n",
    "    par_tensor.data -= torch.inverse(eval_hess) @ eval_grad\n",
    "\n",
    "  return par_tensor.data.numpy()\n",
    "\n",
    "\n",
    "set_seed(2021)\n",
    "model = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "# Define 2d loss objectives and surface values\n",
    "g = lambda u, v: loss_2d(copy.deepcopy(model), u, v)\n",
    "Z = np.fromiter(map(g, U.ravel(), V.ravel()), U.dtype).reshape(V.shape)\n",
    "\n",
    "best_u, best_v  = run_newton(func=g)\n",
    "\n",
    "# Initialization of the variables\n",
    "INITS = [2.5, 3.7]\n",
    "\n",
    "# Used for plotting\n",
    "LABELS = ['GD', 'Momentum']\n",
    "COLORS = ['black', 'red']\n",
    "LSTYLES = ['-', '--']\n",
    "\n",
    "\n",
    "@widgets.interact_manual\n",
    "def momentum_experiment(max_steps=widgets.IntSlider(300, 50, 500, 5),\n",
    "                        lr=widgets.FloatLogSlider(value=1e-1, min=-3, max=0.7, step=0.1),\n",
    "                        beta=widgets.FloatSlider(value=9e-1, min=0, max=1., step=0.01)\n",
    "                        ):\n",
    "  \"\"\"\n",
    "  Displays the momentum experiment as a widget\n",
    "\n",
    "  Args:\n",
    "    max_steps: widget integer slider\n",
    "      Maximum number of steps on the slider with default = 300\n",
    "    lr: widget float slider\n",
    "      Scalar specifying the learning rate or step-size for the update with default = 1e-1\n",
    "    beta: widget float slider\n",
    "      Scalar 'momentum' parameter with default = 9e-1\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Execute both optimizers\n",
    "  sgd_traj = run_optimizer(INITS, eval_fn=g, update_fn=gradient_update,\n",
    "                           max_steps=max_steps, optim_kwargs={'lr': lr})\n",
    "  mom_traj = run_optimizer(INITS, eval_fn=g, update_fn=momentum_update,\n",
    "                           max_steps=max_steps, optim_kwargs={'lr': lr, 'beta':beta})\n",
    "\n",
    "  TRAJS = [sgd_traj, mom_traj]\n",
    "\n",
    "  # Plot distances\n",
    "  fig = plt.figure(figsize=(9,4))\n",
    "  plot_param_distance(best_u, best_v, TRAJS, fig,\n",
    "                      LSTYLES, LABELS, use_log=True, y_min_v=-12.0, y_max_v=1.5)\n",
    "\n",
    "  # # Plot trajectories\n",
    "  fig = plt.figure(figsize=(12, 5))\n",
    "  ax = plot_surface(U, V, Z, fig)\n",
    "  for traj, c, label in zip(TRAJS, COLORS, LABELS):\n",
    "    ax.plot3D(*traj.T, c, linewidth=0.3, label=label)\n",
    "    ax.scatter3D(*traj.T, '.-', s=1, c=c)\n",
    "\n",
    "  # Plot optimum point\n",
    "  ax.scatter(best_u, best_v, Z.min(), marker='*', s=80, c='lime', label='Opt.');\n",
    "  lines = [Line2D([0], [0],\n",
    "                  color=c,\n",
    "                  linewidth=3,\n",
    "                  linestyle='--') for c in COLORS]\n",
    "  lines.append(Line2D([0], [0], color='lime', linewidth=0, marker='*'))\n",
    "  ax.legend(lines, LABELS + ['Optimum'], loc='right',\n",
    "            bbox_to_anchor=(.8, -0.1), ncol=len(LABELS) + 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Momentum_vs_GD_Interactive_Demo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Think! 4: Momentum and oscillations\n",
    "\n",
    "- Discuss how this specific example illustrates the issue of poor conditioning in optimization? How does momentum help resolve these difficulties?\n",
    "\n",
    "- Do you see oscillations for any of these methods? Why does this happen?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_5eaa9306.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Momentum_and_oscillations_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 5: Non-convexity\n",
    "\n",
    "*Time estimate: ~30 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "The introduction of even just 1 hidden layer in the neural network transforms the previous convex optimization problem into a non-convex one. And with great non-convexity, comes great responsibility... (Sorry, we couldn't help it!)\n",
    "\n",
    "**Note:** From this section onwards we will be dealing with non-convex optimization problems for the remainder of the tutorial."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 5: Overparameterization\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 5: Overparameterization\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', '7vUpUEKKl5o'), ('Bilibili', 'BV16h41167Jr')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Overparameterization_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "Take a couple of minutes to play with a more complex 3D visualization of the loss landscape of a neural network on a non-convex problem. Visit https://losslandscape.com/explorer.\n",
    "\n",
    "1. Explore the features on the bottom left corner. You can see an explanation for each icon by clicking on the ( i ) button located on the top right corner.\n",
    "2. Use the 'gradient descent' feature to perform a thought experiment:\n",
    "    -   Choose an initialization\n",
    "    -   Choose the learning rate\n",
    "    -   Mentally formulate your hypothesis about what kind of trajectory you expect to observe\n",
    "3. Run the experiment and contrast your intuition with the observed behavior.\n",
    "4. Repeat this experiment a handful of times for several initialization/learning rate configurations\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Interactive Demo 5: Overparameterization to the rescue!\n",
    "\n",
    "As you may have seen, the non-convex nature of the surface can lead the optimization process to get stuck in undesirable local-optima. There is ample empirical evidence supporting the claim that 'overparameterized' models are easier to train.\n",
    "\n",
    "We will explore this assertion in the context of our MLP training. For this, we initialize a fixed model and construct several models by small random perturbations to the original initialized weights. Now, we train each of these perturbed models and see how the loss evolves. If we were in the convex setting, we should reach very similar objective values upon convergence since all these models were very close at the beginning of training, and in convex problems, the local optimum is also the global optimum.\n",
    "\n",
    "Use the interactive plot below to visualize the loss progression for these perturbed models:\n",
    "\n",
    "1. Select different settings from the `hidden_dims` drop-down menu.\n",
    "2. Explore the effect of the number of steps and learning rate."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Execute this cell to enable the widget!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown Execute this cell to enable the widget!\n",
    "\n",
    "@widgets.interact_manual\n",
    "def overparam(max_steps=widgets.IntSlider(150, 50, 500, 5),\n",
    "              hidden_dims=widgets.Dropdown(options=[\"10\", \"20, 20\", \"100, 100\"],\n",
    "                                           value=\"10\"),\n",
    "              lr=widgets.FloatLogSlider(value=5e-2, min=-3, max=0, step=0.1),\n",
    "              num_inits=widgets.IntSlider(7, 5, 10, 1)):\n",
    "  \"\"\"\n",
    "  Displays the overparameterization phenomenon as a widget\n",
    "\n",
    "  Args:\n",
    "    max_steps: widget integer slider\n",
    "      Maximum number of steps on the slider with default = 150\n",
    "    hidden_dims: widget dropdown menu instance\n",
    "      The number of hidden dimensions with default = 10\n",
    "    lr: widget float slider\n",
    "      Scalar specifying the learning rate or step-size for the update with default = 5e-2\n",
    "    num_inits: widget integer slider\n",
    "      Scalar number of epochs\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "\n",
    "  X, y = train_set.data[subset_index, :], train_set.targets[subset_index]\n",
    "\n",
    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
    "\n",
    "  fig, axs = plt.subplots(1, 1, figsize=(5, 4))\n",
    "\n",
    "  for _ in tqdm(range(num_inits)):\n",
    "    model = copy.deepcopy(base_model)\n",
    "    random_update(model, noise_scale=2e-1)\n",
    "    loss_hist = np.zeros((max_steps, 2))\n",
    "    for step in range(max_steps):\n",
    "      loss = loss_fn(model(X), y)\n",
    "      gradient_update(loss, list(model.parameters()), lr=lr)\n",
    "      loss_hist[step] = np.array([step, loss.item()])\n",
    "\n",
    "    plt.plot(loss_hist[:, 0], loss_hist[:, 1])\n",
    "\n",
    "  plt.xlabel('Iteration')\n",
    "  plt.ylabel('Loss')\n",
    "  plt.ylim(0, 3)\n",
    "  plt.show()\n",
    "\n",
    "  num_params = sum([np.prod(_.shape) for _ in model.parameters()])\n",
    "  print('Number of parameters in model:  ' + str(num_params))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Overparameterization_Interactive_Demo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "### Think! 5.1: Width and depth of the network\n",
    "\n",
    "- We see that as we increase the width/depth of the network, training becomes faster and more consistent across different initializations. What might be the reasons for this behavior?\n",
    "\n",
    "- What are some potential downsides of this approach to dealing with non-convexity?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_d69ca8d7.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Width_and_depth_of_the_network_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 6: Full gradients are expensive\n",
    "\n",
    "*Time estimate: ~25 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "So far we have used only a small (fixed) subset of 500 training examples to perform the updates on the model parameters in our quest to minimize the loss. But what if we decided to use the training set? Do our current approach scale to datasets with tens of thousands, or millions of datapoints?\n",
    "\n",
    "In this section we explore an efficient alternative to avoid having to perform computations on all the training examples before performing a parameter update."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 6: Mini-batches\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 6: Mini-batches\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'hbqUxpNBUGk'), ('Bilibili', 'BV1ty4y1T7Uh')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Mini_batches_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Interactive Demo 6.1: Cost of computation\n",
    "\n",
    "Evaluating a neural network is a relatively fast process. However, when repeated millions of times, the computational cost of performing forward and backward passes through the network starts to become significant.\n",
    "\n",
    "In the visualization below, we show the time (averaged over 5 runs) of computing a forward and backward pass with a changing number of input examples. Choose from the different options in the drop-down box and note how the vertical scale changes depending on the size of the network.\n",
    "\n",
    "**Remarks:** Note that the computational cost of a forward pass shows a clear linear relationship with the number of input examples, and the cost of the corresponding backward pass exhibits a similar computational complexity."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Execute this cell to enable the widget!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown Execute this cell to enable the widget!\n",
    "\n",
    "def gradient_update(loss, params, lr=1e-3):\n",
    "  \"\"\"\n",
    "  Perform a gradient descent update on a given loss over a collection of parameters\n",
    "\n",
    "  Args:\n",
    "    loss: Tensor\n",
    "      A scalar tensor containing the loss through which the gradient will be computed\n",
    "    params: List of iterables\n",
    "      Collection of parameters with respect to which we compute gradients\n",
    "    lr: Float\n",
    "      Scalar specifying the learning rate or step-size for the update\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
    "  # successive backward calls\n",
    "  zero_grad(params)\n",
    "\n",
    "  # Compute gradients on given objective\n",
    "  loss.backward()\n",
    "\n",
    "  with torch.no_grad():\n",
    "    for par in params:\n",
    "       par.data -= lr * par.grad.data\n",
    "\n",
    "\n",
    "def measure_update_time(model, num_points):\n",
    "  \"\"\"\n",
    "  Measuring the time for update\n",
    "\n",
    "  Args:\n",
    "    model: an nn.Module inherited model\n",
    "      Represents the ML/DL model\n",
    "    num_points: integer\n",
    "      The number of data points in the train_set\n",
    "\n",
    "  Returns:\n",
    "    tuple of loss time and time for calculation of gradient\n",
    "  \"\"\"\n",
    "  X, y = train_set.data[:num_points], train_set.targets[:num_points]\n",
    "  start_time = time.time()\n",
    "  loss = loss_fn(model(X), y)\n",
    "  loss_time = time.time()\n",
    "  gradient_update(loss, list(model.parameters()), lr=0)\n",
    "  gradient_time = time.time()\n",
    "  return loss_time - start_time, gradient_time - loss_time\n",
    "\n",
    "\n",
    "@widgets.interact\n",
    "def computation_time(hidden_dims=widgets.Dropdown(options=[\"1\", \"100\", \"50, 50\"],\n",
    "                                                  value=\"100\")):\n",
    "  \"\"\"\n",
    "  Demonstrating time taken for computation as a widget\n",
    "\n",
    "  Args:\n",
    "    hidden_dims: widgets dropdown\n",
    "      The number of hidden dimensions with default = 100\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  hdims = [int(s) for s in hidden_dims.split(',')]\n",
    "  model = MLP(in_dim=784, out_dim=10, hidden_dims=hdims)\n",
    "\n",
    "  NUM_POINTS = [1, 5, 10, 100, 200, 500, 1000, 5000, 10000, 20000, 30000, 50000]\n",
    "  times_list = []\n",
    "  for _ in range(5):\n",
    "    times_list.append(np.array([measure_update_time(model, _) for _ in NUM_POINTS]))\n",
    "\n",
    "  times = np.array(times_list).mean(axis=0)\n",
    "\n",
    "  fig, axs = plt.subplots(1, 1, figsize=(5,4))\n",
    "  plt.plot(NUM_POINTS, times[:, 0], label='Forward')\n",
    "  plt.plot(NUM_POINTS, times[:, 1], label='Backward')\n",
    "  plt.xlabel('Number of data points')\n",
    "  plt.ylabel('Seconds')\n",
    "  plt.legend()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Cost_of_computation_Interactive_Demo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Coding Exercise 6: Implement minibatch sampling\n",
    "\n",
    "Complete the code in `sample_minibatch` so as to produce IID subsets of the training set of the desired size. (This is _not_ a trick question.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def sample_minibatch(input_data, target_data, num_points=100):\n",
    "  \"\"\"\n",
    "  Sample a minibatch of size num_point from the provided input-target data\n",
    "\n",
    "  Args:\n",
    "    input_data: Tensor\n",
    "      Multi-dimensional tensor containing the input data\n",
    "    target_data: Tensor\n",
    "      1D tensor containing the class labels\n",
    "    num_points: Integer\n",
    "      Number of elements to be included in minibatch with default=100\n",
    "\n",
    "  Returns:\n",
    "    batch_inputs: Tensor\n",
    "      Minibatch inputs\n",
    "    batch_targets: Tensor\n",
    "      Minibatch targets\n",
    "  \"\"\"\n",
    "  #################################################\n",
    "  ## TODO for students: sample minibatch of data ##\n",
    "  raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
    "  #################################################\n",
    "  # Sample a collection of IID indices from the existing data\n",
    "  batch_indices = ...\n",
    "  # Use batch_indices to extract entries from the input and target data tensors\n",
    "  batch_inputs = input_data[...]\n",
    "  batch_targets = target_data[...]\n",
    "\n",
    "  return batch_inputs, batch_targets\n",
    "\n",
    "\n",
    "\n",
    "## Uncomment to test your function\n",
    "# x_batch, y_batch = sample_minibatch(X, y, num_points=100)\n",
    "# print(f\"The input shape is {x_batch.shape} and the target shape is: {y_batch.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "```\n",
    "The input shape is torch.Size([100, 28, 28]) and the target shape is: torch.Size([100])\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_02847e9d.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Implement_mini_batch_sampling_Exercise\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Interactive Demo 6.2: *Compare* different minibatch sizes\n",
    "\n",
    "What are the trade-offs induced by the choice of minibatch size? The interactive plot below shows the training evolution of a 2-hidden layer MLP with 100 hidden units in each hidden layer. Different plots correspond to a different choice of minibatch size. We have a fixed time budget for all the cases, reflected in the horizontal axes of these plots."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Execute this cell to enable the widget!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown Execute this cell to enable the widget!\n",
    "\n",
    "@widgets.interact_manual\n",
    "def minibatch_experiment(batch_sizes='20, 250, 1000',\n",
    "                         lrs='5e-3, 5e-3, 5e-3',\n",
    "                         time_budget=widgets.Dropdown(options=[\"2.5\", \"5\", \"10\"],\n",
    "                                                      value=\"2.5\")):\n",
    "  \"\"\"\n",
    "  Demonstration of minibatch experiment\n",
    "\n",
    "  Args:\n",
    "    batch_sizes: String\n",
    "      Size of minibatches\n",
    "    lrs: String\n",
    "      Different learning rates\n",
    "    time_budget: widget dropdown instance\n",
    "      Different time budgets with default=2.5s\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  batch_sizes = [int(s) for s in batch_sizes.split(',')]\n",
    "  lrs = [float(s) for s in lrs.split(',')]\n",
    "\n",
    "  LOSS_HIST = {_:[] for _ in batch_sizes}\n",
    "\n",
    "  X, y = train_set.data, train_set.targets\n",
    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
    "\n",
    "  for id, batch_size in enumerate(tqdm(batch_sizes)):\n",
    "    start_time = time.time()\n",
    "    # Create a new copy of the model for each batch size\n",
    "    model = copy.deepcopy(base_model)\n",
    "    params = list(model.parameters())\n",
    "    lr = lrs[id]\n",
    "    # Fixed budget per choice of batch size\n",
    "    while (time.time() - start_time) < float(time_budget):\n",
    "      data, labels = sample_minibatch(X, y, batch_size)\n",
    "      loss = loss_fn(model(data), labels)\n",
    "      gradient_update(loss, params, lr=lr)\n",
    "      LOSS_HIST[batch_size].append([time.time() - start_time,\n",
    "                                    loss.item()])\n",
    "\n",
    "  fig, axs = plt.subplots(1, len(batch_sizes), figsize=(10, 3))\n",
    "  for ax, batch_size in zip(axs, batch_sizes):\n",
    "    plot_data = np.array(LOSS_HIST[batch_size])\n",
    "    ax.plot(plot_data[:, 0], plot_data[:, 1], label=batch_size,\n",
    "            alpha=0.8)\n",
    "    ax.set_title('Batch size: ' + str(batch_size))\n",
    "    ax.set_xlabel('Seconds')\n",
    "    ax.set_ylabel('Loss')\n",
    "  plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "**Remarks:** SGD works! We have an algorithm that can be applied (with due precautions) to learn datasets of arbitrary size.\n",
    "\n",
    "However, **note the difference in the vertical scale** across the plots above. When using a larger minibatch, we can perform fewer parameter updates as the forward and backward passes are more expensive.\n",
    "\n",
    "This highlights the interplay between the minibatch size and the learning rate: when our minibatch is larger, we have a more confident estimator of the direction to move, and thus can afford a larger learning rate. On the other hand, extremely small minibatches are very fast computationally but are not representative of the data distribution and yield estimations of the gradient with high variance.\n",
    "\n",
    "We encourage you to tune the value of the learning rate for each of the minibatch sizes in the previous demo, to achieve a training loss steadily below 0.5 within 5 seconds."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Compare_different_minibatch_sizes_Interactive_Demo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 7: Adaptive methods\n",
    "\n",
    "*Time estimate: ~25 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "As of now, you should be aware that there are many knobs to turn when working on a machine learning problem. Some of these relate to the optimization algorithm, the choice of model, or the objective to minimize. Here are some prototypical examples:\n",
    "\n",
    "- Problem: loss function, regularization coefficients (Week 1, Day 5)\n",
    "- Model: architecture, activations function\n",
    "- Optimizer: learning rate, batch size, momentum coefficient\n",
    "\n",
    "We concentrate on the choices that are directly related to optimization. In particular, we will explore some _automatic_ methods for setting the learning rate in a way that fixes the poor-conditioning problem and is robust across different problems.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 7: Adaptive Methods\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 7: Adaptive Methods\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'Zr6r2kfmQUM'), ('Bilibili', 'BV1eq4y1W7JG')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Adaptive_Methods_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Coding Exercise 7: Implement RMSprop\n",
    "\n",
    "In this exercise you will implement the update of the RMSprop optimizer:\n",
    "\n",
    "\\begin{align}\n",
    "v_{t} &= \\alpha v_{t-1} + (1 - \\alpha) \\nabla J(w_t)^2 \\\\ \\\\\n",
    "w_{t+1} &= w_t - \\eta \\frac{\\nabla J(w_t)}{\\sqrt{v_t + \\epsilon}}\n",
    "\\end{align}\n",
    "\n",
    "where the non-standard operations (the division of two vectors, squaring a vector, etc.) are to be interpreted as element-wise operations, i.e., the operation is applied to each (pair of) entry(ies) of the vector(s) considered as real number(s).\n",
    "\n",
    "Here, the $\\epsilon$ hyperparameter provides numerical stability to the algorithm by preventing the learning rate from becoming too big when $v_t$ is small. Typically, we set $\\epsilon$ to a small default value, like $10^{-8}$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "def rmsprop_update(loss, params, grad_sq, lr=1e-3, alpha=0.8, epsilon=1e-8):\n",
    "  \"\"\"\n",
    "  Perform an RMSprop update on a collection of parameters\n",
    "\n",
    "  Args:\n",
    "    loss: Tensor\n",
    "      A scalar tensor containing the loss whose gradient will be computed\n",
    "    params: Iterable\n",
    "      Collection of parameters with respect to which we compute gradients\n",
    "    grad_sq: Iterable\n",
    "      Moving average of squared gradients\n",
    "    lr: Float\n",
    "      Scalar specifying the learning rate or step-size for the update\n",
    "    alpha: Float\n",
    "      Moving average parameter\n",
    "    epsilon: Float\n",
    "      quotient for numerical stability\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  # Clear up gradients as Pytorch automatically accumulates gradients from\n",
    "  # successive backward calls\n",
    "  zero_grad(params)\n",
    "  # Compute gradients on given objective\n",
    "  loss.backward()\n",
    "\n",
    "  with torch.no_grad():\n",
    "    for (par, gsq) in zip(params, grad_sq):\n",
    "      #################################################\n",
    "      ## TODO for students: update the value of the parameter ##\n",
    "      # Use gsq.data and par.grad\n",
    "      raise NotImplementedError(\"Student exercise: implement gradient update\")\n",
    "      #################################################\n",
    "      # Update estimate of gradient variance\n",
    "      gsq.data = ...\n",
    "      # Update parameters\n",
    "      par.data -=  ...\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "set_seed(seed=SEED)\n",
    "model3 = MLP(in_dim=784, out_dim=10, hidden_dims=[])\n",
    "print('\\n The model3 parameters before the update are: \\n')\n",
    "print_params(model3)\n",
    "loss = loss_fn(model3(X), y)\n",
    "# Initialize the moving average of squared gradients\n",
    "grad_sq = [1e-6*i for i in list(model3.parameters())]\n",
    "\n",
    "\n",
    "\n",
    "## Uncomment below to test your function\n",
    "# rmsprop_update(loss, list(model3.parameters()), grad_sq=grad_sq, lr=1e-3)\n",
    "# print('\\n The model3 parameters after the update are: \\n')\n",
    "# print_params(model3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "```\n",
    " The model3 parameters after the update are:\n",
    "\n",
    "main.0.weight tensor([[-0.0240,  0.0031,  0.0193,  ...,  0.0316,  0.0297, -0.0198],\n",
    "        [-0.0063, -0.0318, -0.0109,  ..., -0.0093,  0.0232, -0.0255],\n",
    "        [ 0.0218, -0.0253,  0.0320,  ...,  0.0102,  0.0248, -0.0203],\n",
    "        ...,\n",
    "        [-0.0027,  0.0136,  0.0089,  ...,  0.0123, -0.0324, -0.0166],\n",
    "        [ 0.0159,  0.0281,  0.0233,  ..., -0.0133, -0.0197,  0.0182],\n",
    "        [ 0.0186, -0.0376, -0.0205,  ..., -0.0293,  0.0077, -0.0019]])\n",
    "main.0.bias tensor([-0.0313, -0.0011,  0.0122, -0.0342,  0.0045,  0.0199,  0.0329,  0.0265,\n",
    "         0.0182, -0.0041])\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_f7291fed.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Implement_RMSProp_Exercise\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Interactive Demo 7: Compare optimizers\n",
    "\n",
    "Below, we compare your implementations of **SGD**, **Momentum**, and **RMSprop**. If you have successfully coded all the exercises so far: congrats!\n",
    "\n",
    "You are now *in the know* of some of the most commonly used and powerful optimization tools for deep learning."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Execute this cell to enable the widget!\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @markdown Execute this cell to enable the widget!\n",
    "X, y = train_set.data, train_set.targets\n",
    "\n",
    "@widgets.interact_manual\n",
    "def compare_optimizers(\n",
    "    batch_size=(25, 250, 5),\n",
    "    lr=widgets.FloatLogSlider(value=2e-3, min=-5, max=0),\n",
    "    max_steps=(50, 500, 5)):\n",
    "  \"\"\"\n",
    "  Demonstration to compare optimisers - stochastic gradient descent, momentum, RMSprop\n",
    "\n",
    "  Args:\n",
    "    batch_size: Tuple\n",
    "      Size of minibatches\n",
    "    lr: Float log slider instance\n",
    "      Scalar specifying the learning rate or step-size for the update\n",
    "    max_steps: Tuple\n",
    "      Max number of step sizes for incrementing\n",
    "\n",
    "  Returns:\n",
    "    Nothing\n",
    "  \"\"\"\n",
    "  SGD_DICT = [gradient_update, 'SGD', 'black', '-', {'lr': lr}]\n",
    "  MOM_DICT = [momentum_update, 'Momentum', 'red', '--', {'lr': lr, 'beta': 0.9}]\n",
    "  RMS_DICT = [rmsprop_update, 'RMSprop', 'fuchsia', '-', {'lr': lr, 'alpha': 0.8}]\n",
    "\n",
    "  ALL_DICTS = [SGD_DICT, MOM_DICT, RMS_DICT]\n",
    "\n",
    "  base_model = MLP(in_dim=784, out_dim=10, hidden_dims=[100, 100])\n",
    "\n",
    "  LOSS_HIST = {}\n",
    "\n",
    "  for opt_dict in tqdm(ALL_DICTS):\n",
    "    update_fn, opt_name, color, lstyle, kwargs = opt_dict\n",
    "    LOSS_HIST[opt_name] = []\n",
    "\n",
    "    model = copy.deepcopy(base_model)\n",
    "    params = list(model.parameters())\n",
    "\n",
    "    if opt_name != 'SGD':\n",
    "      aux_tensors = [torch.zeros_like(_) for _ in params]\n",
    "\n",
    "    for step in range(max_steps):\n",
    "      data, labels = sample_minibatch(X, y, batch_size)\n",
    "      loss = loss_fn(model(data), labels)\n",
    "      if opt_name == 'SGD':\n",
    "        update_fn(loss, params, **kwargs)\n",
    "      else:\n",
    "        update_fn(loss, params, aux_tensors, **kwargs)\n",
    "      LOSS_HIST[opt_name].append(loss.item())\n",
    "\n",
    "  fig, axs = plt.subplots(1, len(ALL_DICTS), figsize=(9, 3))\n",
    "  for ax, optim_dict in zip(axs, ALL_DICTS):\n",
    "    opt_name = optim_dict[1]\n",
    "    ax.plot(range(max_steps), LOSS_HIST[opt_name], alpha=0.8)\n",
    "    ax.set_title(opt_name)\n",
    "    ax.set_xlabel('Iteration')\n",
    "    ax.set_ylabel('Loss')\n",
    "    ax.set_ylim(0, 2.5)\n",
    "  plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Compare_optimizers_Interactive_Demo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Think 7.1!: Compare optimizers\n",
    "\n",
    "Tune the three methods above - **SGD**, **Momentum**, and **RMSProp** - to make each excel and discuss your findings. How do the methods compare in terms of robustness to small changes of the hyperparameters? How easy was it to find a good hyperparameter configuration?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_adc539df.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Compare_optimizers_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "**Remarks:** Note that RMSprop allows us to use a 'per-dimension' learning rate _without having to tune one learning rate for each dimension **ourselves**_. The method uses information collected about the variance of the gradients throughout training to **adapt** the step size for each of the parameters automatically. The savings in tuning efforts of RMSprop over SGD or 'plain' momentum are undisputed on this task.\n",
    "\n",
    "Moreover, adaptive optimization methods are currently a highly active research domain, with many related algorithms like Adam, AMSgrad, Adagrad being used in practical application and theoretically investigated."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "### Locality of Gradients\n",
    "\n",
    "As we've seen throughout this tutorial, poor conditioning can be a significant burden on convergence to an optimum while using gradient-based optimization. Of the methods we've seen to deal with this issue, notice how both momentum and adaptive learning rates incorporate past gradient values into their update schemes. Why do we use past values of our loss function's gradient while updating our current MLP weights?\n",
    "\n",
    "Recall from *W1D2* that the gradient of a function, $\\nabla f(w_t)$, is a **local** property and computes the direction of maximum change of $f(w_t)$ at the point $w_t$. However, when we train our MLP model we are hoping to find the **global** optimum for our training loss. By incorporating past values of our function's gradient into our optimization schemes, we use more information about the overall shape of our function than just a single gradient alone can provide."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Think! 7.2: Loss function and optimization\n",
    "\n",
    "Can you think of other ways we can incorporate more information about our loss function into our optimization schemes?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_c7070297.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Loss_function_and_optimization_Discussion\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Section 8: Ethical concerns\n",
    "\n",
    "*Time estimate: ~15mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 8: Ethical concerns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 8: Ethical concerns\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', '0EthSI0cknI'), ('Bilibili', 'BV1TU4y1G7Je')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Ethical_concerns_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Summary\n",
    "\n",
    "* Optimization is necessary to create Deep Learning models that are guaranteed to converge\n",
    "* Stochastic Gradient Descent and Momentum are two commonly used optimization techniques\n",
    "* RMSProp is a way of adaptive hyperparameter tuning which utilises a per-dimension learning rate\n",
    "* Poor choice of optimization objectives can lead to unforeseen, undesirable consequences\n",
    "\n",
    "If you have time left, you can read the Bonus material, where we put it all together and we compare our model with a benchmark model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Daily survey\n",
    "\n",
    "Don't forget to complete your reflections and content check in the daily survey! Please be patient after logging in as there is\n",
    "a small delay before you will be redirected to the survey.\n",
    "\n",
    "<a href=\"https://portal.neuromatchacademy.org/api/redirect/to/2c5bbb85-d91a-4f5a-99fa-cefc287653d7\"><img src=\"https://github.com/NeuromatchAcademy/course-content-dl/blob/main/tutorials/static/SurveyButton.png?raw=1\" alt=\"button link to survey\" style=\"width:410px\"></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "---\n",
    "# Bonus: Putting it all together\n",
    "\n",
    "*Time estimate: ~40 mins*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "We have progressively built a sophisticated optimization algorithm, which is able to deal with a non-convex, poor-conditioned problem concerning tens of thousands of training examples. Now we present _you_ with a small challenge: beat us! :P\n",
    "\n",
    "Your mission is to train an MLP model that can compete with a benchmark model which we have pre-trained for you. In this section you will be able to use the full Pytorch power: loading the data, defining the model, sampling minibatches as well as Pytorch's **optimizer implementations**.\n",
    "\n",
    "There is a big engineering component behind the design of optimizers and their implementation can sometimes become tricky. So unless you are directly doing research in optimization, it's recommended to use an implementation provided by a widely reviewed open-source library."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Video 9: Putting it all together\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "remove-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Video 9: Putting it all together\n",
    "from ipywidgets import widgets\n",
    "from IPython.display import YouTubeVideo\n",
    "from IPython.display import IFrame\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "class PlayVideo(IFrame):\n",
    "  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n",
    "    self.id = id\n",
    "    if source == 'Bilibili':\n",
    "      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n",
    "    elif source == 'Osf':\n",
    "      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n",
    "    super(PlayVideo, self).__init__(src, width, height, **kwargs)\n",
    "\n",
    "\n",
    "def display_videos(video_ids, W=400, H=300, fs=1):\n",
    "  tab_contents = []\n",
    "  for i, video_id in enumerate(video_ids):\n",
    "    out = widgets.Output()\n",
    "    with out:\n",
    "      if video_ids[i][0] == 'Youtube':\n",
    "        video = YouTubeVideo(id=video_ids[i][1], width=W,\n",
    "                             height=H, fs=fs, rel=0)\n",
    "        print(f'Video available at https://youtube.com/watch?v={video.id}')\n",
    "      else:\n",
    "        video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n",
    "                          height=H, fs=fs, autoplay=False)\n",
    "        if video_ids[i][0] == 'Bilibili':\n",
    "          print(f'Video available at https://www.bilibili.com/video/{video.id}')\n",
    "        elif video_ids[i][0] == 'Osf':\n",
    "          print(f'Video available at https://osf.io/{video.id}')\n",
    "      display(video)\n",
    "    tab_contents.append(out)\n",
    "  return tab_contents\n",
    "\n",
    "\n",
    "video_ids = [('Youtube', 'DP9c13vLiOM'), ('Bilibili', 'BV1MK4y1u7u2')]\n",
    "tab_contents = display_videos(video_ids, W=730, H=410)\n",
    "tabs = widgets.Tab()\n",
    "tabs.children = tab_contents\n",
    "for i in range(len(tab_contents)):\n",
    "  tabs.set_title(i, video_ids[i][0])\n",
    "display(tabs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "content_review(f\"{feedback_prefix}_Putting_it_all_together_Bonus_Video\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Download parameters of the benchmark model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Download parameters of the benchmark model\n",
    "import requests\n",
    "\n",
    "fname = 'benchmark_model.pt'\n",
    "url = \"https://osf.io/sj4e8/download\"\n",
    "r = requests.get(url, allow_redirects=True)\n",
    "with open(fname, 'wb') as fh:\n",
    "  fh.write(r.content)\n",
    "\n",
    "# Load the benchmark model's parameters\n",
    "DEVICE = set_device()\n",
    "if DEVICE == \"cuda\":\n",
    "  benchmark_state_dict = torch.load(fname)\n",
    "else:\n",
    "  benchmark_state_dict = torch.load(fname, map_location=torch.device('cpu'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Create MLP object and update weights with those of saved model\n",
    "benchmark_model = MLP(in_dim=784, out_dim=10,\n",
    "                      hidden_dims=[200, 100, 50]).to(DEVICE)\n",
    "benchmark_model.load_state_dict(benchmark_state_dict)\n",
    "\n",
    "\n",
    "# Define helper function to evaluate models\n",
    "def eval_model(model, data_loader, num_batches=np.inf, device='cpu'):\n",
    "  \"\"\"\n",
    "  To evaluate a given model\n",
    "\n",
    "  Args:\n",
    "    model: nn.Module derived class\n",
    "      The model which is to be evaluated\n",
    "    data_loader: Iterable\n",
    "      A configured dataloading utility\n",
    "    num_batches: Integer\n",
    "      Size of minibatches\n",
    "    device: String\n",
    "      Sets the device. CUDA if available, CPU otherwise\n",
    "\n",
    "  Returns:\n",
    "    mean of log loss and mean of log accuracy\n",
    "  \"\"\"\n",
    "\n",
    "  loss_log, acc_log = [], []\n",
    "  model.to(device=device)\n",
    "\n",
    "  # We are just evaluating the model, no need to compute gradients\n",
    "  with torch.no_grad():\n",
    "    for batch_id, batch in enumerate(data_loader):\n",
    "      # If we only evaluate a number of batches, stop after we reach that number\n",
    "      if batch_id > num_batches:\n",
    "        break\n",
    "      # Extract minibatch data\n",
    "      data, labels = batch[0].to(device), batch[1].to(device)\n",
    "      # Evaluate model and loss on minibatch\n",
    "      preds = model(data)\n",
    "      loss_log.append(loss_fn(preds, labels).item())\n",
    "      acc_log.append(torch.mean(1. * (preds.argmax(dim=1) == labels)).item())\n",
    "\n",
    "  return np.mean(loss_log), np.mean(acc_log)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "We define an optimizer in the following steps:\n",
    "\n",
    "1. Load  the corresponding class that implements the parameter updates and other internal management activities, including:\n",
    "    - create auxiliary variables,\n",
    "    - update moving averages,\n",
    "    - adjust the learning rate.\n",
    "2. Pass the parameters of the Pytorch model that the optimizer has control over. Note that different optimizers can potentially control different parameter groups.\n",
    "3. Specify hyperparameters, including learning rate, momentum, moving average factors, etc.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Exercise Bonus: Train your own model\n",
    "\n",
    "Now, train the model with your preferred optimizer and find a good combination of hyperparameter settings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "#################################################\n",
    "## TODO for students: adjust training settings ##\n",
    "\n",
    "# The three parameters below are in your full control\n",
    "MAX_EPOCHS = 2  # select number of epochs to train\n",
    "LR = 1e-5  # choose the step size\n",
    "BATCH_SIZE = 64  # number of examples per minibatch\n",
    "\n",
    "# Define the model and associated optimizer -- you may change its architecture!\n",
    "my_model = MLP(in_dim=784, out_dim=10, hidden_dims=[200, 100, 50]).to(DEVICE)\n",
    "\n",
    "# You can take your pick from many different optimizers\n",
    "# Check the optimizer documentation and hyperparameter meaning before using!\n",
    "# More details on Pytorch optimizers: https://pytorch.org/docs/stable/optim.html\n",
    "# optimizer = torch.optim.SGD(my_model.parameters(), lr=LR, momentum=0.9)\n",
    "# optimizer = torch.optim.RMSprop(my_model.parameters(), lr=LR, alpha=0.99)\n",
    "# optimizer = torch.optim.Adagrad(my_model.parameters(), lr=LR)\n",
    "optimizer = torch.optim.Adam(my_model.parameters(), lr=LR)\n",
    "#################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Train `my_model` with the optimizer configured in the previous cell,\n",
    "# recording per-step training metrics and periodic validation metrics.\n",
    "set_seed(seed=SEED)\n",
    "# Print training stats every LOG_FREQ minibatches\n",
    "LOG_FREQ = 200\n",
    "# Frequency for evaluating the validation metrics\n",
    "VAL_FREQ = 200\n",
    "# Load data using a Pytorch Dataset\n",
    "train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)\n",
    "\n",
    "# We separate 10,000 training samples to create a validation set\n",
    "train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])\n",
    "\n",
    "# Create the corresponding DataLoaders for training, validation and test.\n",
    "# All three loaders share one seeded generator.\n",
    "g_seed = torch.Generator()\n",
    "g_seed.manual_seed(SEED)\n",
    "\n",
    "train_loader = torch.utils.data.DataLoader(train_set_orig,\n",
    "                                           shuffle=True,\n",
    "                                           batch_size=BATCH_SIZE,\n",
    "                                           num_workers=2,\n",
    "                                           worker_init_fn=seed_worker,\n",
    "                                           generator=g_seed)\n",
    "val_loader = torch.utils.data.DataLoader(val_set_orig,\n",
    "                                         shuffle=True,\n",
    "                                         batch_size=256,\n",
    "                                         num_workers=2,\n",
    "                                         worker_init_fn=seed_worker,\n",
    "                                         generator=g_seed)\n",
    "test_loader = torch.utils.data.DataLoader(test_set_orig,\n",
    "                                          batch_size=256,\n",
    "                                          num_workers=2,\n",
    "                                          worker_init_fn=seed_worker,\n",
    "                                          generator=g_seed)\n",
    "\n",
    "# Run training\n",
    "# `val_idx` stores the (1-based) optimization step at which each validation\n",
    "# measurement was taken, so it can be plotted against the training curves.\n",
    "metrics = {'train_loss':[],\n",
    "           'train_acc':[],\n",
    "           'val_loss':[],\n",
    "           'val_acc':[],\n",
    "           'val_idx':[]}\n",
    "\n",
    "step_idx = 0\n",
    "for epoch in tqdm(range(MAX_EPOCHS)):\n",
    "\n",
    "  # Accumulators for the averaged [TRAIN] log line; reset after each print\n",
    "  # and at the start of every epoch.\n",
    "  running_loss, running_acc = 0., 0.\n",
    "\n",
    "  for batch_id, batch in enumerate(train_loader):\n",
    "    step_idx += 1\n",
    "    # Extract minibatch data and labels\n",
    "    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)\n",
    "    # Just like before, refresh gradient accumulators.\n",
    "    # Note that this is now a method of the optimizer.\n",
    "    optimizer.zero_grad()\n",
    "    # Evaluate model and loss on minibatch\n",
    "    preds = my_model(data)\n",
    "    loss = loss_fn(preds, labels)\n",
    "    acc = torch.mean(1.0 * (preds.argmax(dim=1) == labels))\n",
    "    # Compute gradients\n",
    "    loss.backward()\n",
    "    # Update parameters\n",
    "    # Note how all the magic in the update of the parameters is encapsulated by\n",
    "    # the optimizer class.\n",
    "    optimizer.step()\n",
    "    # Convert the metrics to Python floats once per step; the same values feed\n",
    "    # both the plotting history and the running averages below.\n",
    "    loss_value, acc_value = loss.item(), acc.item()\n",
    "    # Log metrics for plotting\n",
    "    metrics['train_loss'].append(loss_value)\n",
    "    metrics['train_acc'].append(acc_value)\n",
    "\n",
    "    if batch_id % VAL_FREQ == (VAL_FREQ - 1):\n",
    "      # Estimate the validation metrics on at most 100 batches.\n",
    "      # NOTE(review): 10,000 samples at batch size 256 give only 40 batches, so\n",
    "      # this cap is presumably never reached - confirm against eval_model.\n",
    "      val_loss, val_acc = eval_model(my_model, val_loader,\n",
    "                                     num_batches=100,\n",
    "                                     device=DEVICE)\n",
    "      metrics['val_idx'].append(step_idx)\n",
    "      metrics['val_loss'].append(val_loss)\n",
    "      metrics['val_acc'].append(val_acc)\n",
    "\n",
    "      print(f\"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
    "            f\"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%\")\n",
    "\n",
    "    # Accumulate statistics for the next [TRAIN] log line\n",
    "    running_loss += loss_value\n",
    "    running_acc += acc_value\n",
    "    # Print every LOG_FREQ minibatches\n",
    "    if batch_id % LOG_FREQ == (LOG_FREQ-1):\n",
    "      print(f\"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - \"\n",
    "            f\"Loss: {running_loss / LOG_FREQ:.3f} - \"\n",
    "            f\"Acc: {100 * running_acc / LOG_FREQ:.3f}%\")\n",
    "\n",
    "      running_loss, running_acc = 0., 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# One panel per metric: the per-step training curve plus the periodic\n",
    "# validation measurements.\n",
    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n",
    "\n",
    "for axis, key, ylabel in zip(ax, ('loss', 'acc'), ('Loss', 'Accuracy')):\n",
    "  train_values = metrics[f'train_{key}']\n",
    "  # `val_idx` holds 1-based step counts (the training loop increments its step\n",
    "  # counter before logging), so number the training steps from 1 as well;\n",
    "  # otherwise the two curves are offset by one iteration.\n",
    "  axis.plot(range(1, len(train_values) + 1), train_values,\n",
    "            alpha=0.8, label='Train')\n",
    "  axis.plot(metrics['val_idx'], metrics[f'val_{key}'], label='Valid')\n",
    "  axis.set_xlabel('Iteration')\n",
    "  axis.set_ylabel(ylabel)\n",
    "  axis.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "# The feedback key is `feedback_prefix` (defined outside this cell) plus this\n",
    "# exercise's identifier.\n",
    "content_review(\n",
    "  f\"{feedback_prefix}_Train_your_own_model_Bonus_Exercise\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "## Think! Bonus: Metrics\n",
    "\n",
    "Which metric did you optimize when searching for the right configuration? The training set loss? Accuracy? Validation/test set metrics? Why? Discuss!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "execution": {}
   },
   "source": [
    "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D5_Optimization/solutions/W1D5_Tutorial1_Solution_093a66ad.py)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Submit your feedback\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "execution": {},
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# @title Submit your feedback\n",
    "# The feedback key is `feedback_prefix` (defined outside this cell) plus this\n",
    "# discussion's identifier.\n",
    "content_review(\n",
    "  f\"{feedback_prefix}_Metrics_Bonus_Discussion\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {}
   },
   "source": [
    "### Evaluation\n",
    "\n",
    "We _finally_ can evaluate and compare the performance of the models on previously unseen examples.\n",
    "\n",
    "Which model would you keep? (\\*drum roll*)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {}
   },
   "outputs": [],
   "source": [
    "# Report train/test loss and accuracy for each model in the same format,\n",
    "# first for the model trained above and then for the benchmark.\n",
    "for header, candidate in (('Your model...', my_model),\n",
    "                          ('\\nBenchmark model', benchmark_model)):\n",
    "  print(header)\n",
    "  train_loss, train_accuracy = eval_model(candidate, train_loader, device=DEVICE)\n",
    "  test_loss, test_accuracy = eval_model(candidate, test_loader, device=DEVICE)\n",
    "  print(f'Train Loss {train_loss:.3f} / Test Loss {test_loss:.3f}')\n",
    "  print(f'Train Accuracy {100*train_accuracy:.3f}% / Test Accuracy {100*test_accuracy:.3f}%')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "include_colab_link": true,
   "name": "W1D5_Tutorial1",
   "provenance": [],
   "toc_visible": true
  },
  "kernel": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}