{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {}, "id": "view-in-github" }, "source": [ "\"Open   \"Open" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "# Tutorial 2: Deep MLPs\n", "\n", "**Week 1, Day 3: Multi Layer Perceptrons**\n", "\n", "**By Neuromatch Academy**\n", "\n", "__Content creators:__ Arash Ash, Surya Ganguli\n", "\n", "__Content reviewers:__ Saeed Salehi, Felix Bartsch, Yu-Fang Yang, Melvin Selim Atay, Kelson Shilling-Scrivo\n", "\n", "__Content editors:__ Gagana B, Kelson Shilling-Scrivo, Spiros Chavlis\n", "\n", "__Production editors:__ Anoop Kulkarni, Kelson Shilling-Scrivo, Gagana B, Spiros Chavlis" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Tutorial Objectives\n", "In this tutorial, we will dive deeper into MLPs and see more of their mathematical and practical aspects. Today we are going to see why MLPs:\n", "* Can be deep or wide\n", "* Dependant on transfer functions\n", "* Sensitive to initialization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @markdown\n", "from IPython.display import IFrame\n", "from ipywidgets import widgets\n", "out = widgets.Output()\n", "with out:\n", " print(f\"If you want to download the slides: https://osf.io/download/ed65b/\")\n", " display(IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/ed65b/?direct%26mode=render%26action=download%26mode=render\", width=730, height=410))\n", "display(out)" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Setup\n", "\n", "This is a GPU free notebook!" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Install and import feedback gadget\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Install and import feedback gadget\n", "\n", "!pip3 install vibecheck datatops --quiet\n", "\n", "from vibecheck import DatatopsContentReviewContainer\n", "def content_review(notebook_section: str):\n", " return DatatopsContentReviewContainer(\n", " \"\", # No text prompt\n", " notebook_section,\n", " {\n", " \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n", " \"name\": \"neuromatch_dl\",\n", " \"user_key\": \"f379rz8y\",\n", " },\n", " ).render()\n", "\n", "\n", "feedback_prefix = \"W1D3_T2\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "# Imports\n", "import pathlib\n", "\n", "import torch\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "\n", "from torchvision.utils import make_grid\n", "import torchvision.transforms as transforms\n", "from torchvision.datasets import ImageFolder\n", "from torch.utils.data import DataLoader, TensorDataset\n", "\n", "from tqdm.auto import tqdm\n", "from IPython.display import display" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Figure Settings\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Figure Settings\n", "import logging\n", "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "import ipywidgets as widgets # Interactive display\n", "%config InlineBackend.figure_format = 'retina'\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle\")\n", "my_layout = widgets.Layout()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Helper functions (MLP Tutorial 1 Codes)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Helper functions (MLP Tutorial 1 Codes)\n", "\n", "# @markdown `Net(nn.Module)`\n", "\n", "class Net(nn.Module):\n", " \"\"\"\n", " Simulate MLP Network\n", " \"\"\"\n", "\n", " def __init__(self, actv, input_feature_num, hidden_unit_nums, output_feature_num):\n", " \"\"\"\n", " Initialize MLP Network parameters\n", "\n", " Args:\n", " actv: string\n", " Activation function\n", " input_feature_num: int\n", " Number of input features\n", " hidden_unit_nums: list\n", " Number of units per hidden layer. 
List of integers\n", " output_feature_num: int\n", " Number of output features\n", "\n", " Returns:\n", " Nothing\n", " \"\"\"\n", " super(Net, self).__init__()\n", " self.input_feature_num = input_feature_num # Save the input size for reshaping later\n", " self.mlp = nn.Sequential() # Initialize layers of MLP\n", "\n", " in_num = input_feature_num # Initialize the temporary input feature to each layer\n", " for i in range(len(hidden_unit_nums)): # Loop over layers and create each one\n", " out_num = hidden_unit_nums[i] # Assign the current layer hidden unit from list\n", " layer = nn.Linear(in_num, out_num) # Use nn.Linear to define the layer\n", " in_num = out_num # Assign next layer input using current layer output\n", " self.mlp.add_module(f\"Linear_{i}\", layer) # Append layer to the model with a name\n", "\n", " actv_layer = eval(f\"nn.{actv}\") # Assign activation function (eval allows us to instantiate object from string)\n", " self.mlp.add_module(f\"Activation_{i}\", actv_layer) # Append activation to the model with a name\n", "\n", " out_layer = nn.Linear(in_num, output_feature_num) # Create final layer\n", " self.mlp.add_module('Output_Linear', out_layer) # Append the final layer\n", "\n", " def forward(self, x):\n", " \"\"\"\n", " Simulate forward pass of MLP Network\n", "\n", " Args:\n", " x: torch.tensor\n", " Input data\n", "\n", " Returns:\n", " logits: torch.tensor\n", " Output of the network's forward pass\n", " \"\"\"\n", " # Reshape inputs to (batch_size, input_feature_num)\n", " # Just in case the input vector is not 2D, like an image!\n", " x = x.view(-1, self.input_feature_num)\n", "\n", " logits = self.mlp(x) # Forward pass of MLP\n", " return logits\n", "\n", "\n", "# @markdown `train_test_classification(net, criterion, optimizer, train_loader, test_loader, num_epochs=1, verbose=True, training_plot=False)`\n", "def train_test_classification(net, criterion, optimizer, train_loader,\n", " test_loader, num_epochs=1, verbose=True,\n", " training_plot=False, device='cpu'):\n", " \"\"\"\n", " Accumulate training loss/Evaluate performance\n", "\n", " Args:\n", " net: Instance of Net class\n", " The network to train and evaluate\n", " criterion: torch.nn type\n", " Loss function, e.g., nn.CrossEntropyLoss (combines LogSoftmax and NLLLoss)\n", " optimizer: torch.optim type\n", " Optimizer, e.g., one implementing the Adam algorithm\n", " train_loader: torch.utils.data type\n", " Combines the train dataset and sampler, and provides an iterable over the given dataset.\n", " test_loader: torch.utils.data type\n", " Combines the test dataset and sampler, and provides an iterable over the given dataset.\n", " num_epochs: int\n", " Number of epochs [default: 1]\n", " verbose: boolean\n", " If True, print statistics\n", " training_plot: boolean\n", " If True, display training plot\n", " device: string\n", " CUDA/GPU if available, CPU otherwise\n", "\n", " Returns:\n", " train_acc: float\n", " Accuracy on the training set\n", " test_acc: float\n", " Accuracy on the test set\n", " \"\"\"\n", " net.to(device)\n", " net.train()\n", " training_losses = []\n", " for epoch in tqdm(range(num_epochs)): # Loop over the dataset multiple times\n", " for i, data in enumerate(train_loader, 0):\n", " # Get the inputs; data is a list of [inputs, labels]\n", " inputs, labels = data\n", " inputs = inputs.to(device).float()\n", " labels = labels.to(device).long()\n", "\n", " # Zero the parameter gradients\n", " optimizer.zero_grad()\n", "\n", " # forward + backward + optimize\n", " outputs = net(inputs)\n", "\n", " loss = criterion(outputs, labels)\n", " loss.backward()\n", " optimizer.step()\n",
"\n", " # Print statistics\n", " if verbose:\n", " training_losses += [loss.item()]\n", "\n", " net.eval()\n", "\n", " def test(data_loader):\n", " \"\"\"\n", " Function to gauge network performance\n", "\n", " Args:\n", " data_loader: torch.utils.data type\n", " Combines the test dataset and sampler, and provides an iterable over the given dataset.\n", "\n", " Returns:\n", " acc: float\n", " Performance of the network\n", " total: int\n", " Number of datapoints in the dataloader\n", " \"\"\"\n", " correct = 0\n", " total = 0\n", " for data in data_loader:\n", " inputs, labels = data\n", " inputs = inputs.to(device).float()\n", " labels = labels.to(device).long()\n", "\n", " outputs = net(inputs)\n", " _, predicted = torch.max(outputs, 1)\n", " total += labels.size(0)\n", " correct += (predicted == labels).sum().item()\n", "\n", " acc = 100 * correct / total\n", " return total, acc\n", "\n", " train_total, train_acc = test(train_loader)\n", " test_total, test_acc = test(test_loader)\n", "\n", " if verbose:\n", " print(f'\\nAccuracy on the {train_total} training samples: {train_acc:0.2f}')\n", " print(f'Accuracy on the {test_total} testing samples: {test_acc:0.2f}\\n')\n", "\n", " if training_plot:\n", " plt.plot(training_losses)\n", " plt.xlabel('Batch')\n", " plt.ylabel('Training loss')\n", " plt.show()\n", "\n", " return train_acc, test_acc\n", "\n", "\n", "# @markdown `shuffle_and_split_data(X, y, seed)`\n", "def shuffle_and_split_data(X, y, seed):\n", " \"\"\"\n", " Helper function to shuffle and split data\n", "\n", " Args:\n", " X: torch.tensor\n", " Input data\n", " y: torch.tensor\n", " Corresponding target variables\n", " seed: int\n", " Set seed for reproducibility\n", "\n", " Returns:\n", " X_test: torch.tensor\n", " Test data [20% of X]\n", " y_test: torch.tensor\n", " Labels corresponding to above mentioned test data\n", " X_train: torch.tensor\n", " Train data [80% of X]\n", " y_train: torch.tensor\n", " Labels corresponding to above mentioned train data\n", " \"\"\"\n", " # Set seed for reproducibility\n", " torch.manual_seed(seed)\n", " # Number of samples\n", " N = X.shape[0]\n", " # Shuffle data\n", " shuffled_indices = torch.randperm(N) # Get indices to shuffle data, could use torch.randperm\n", " X = X[shuffled_indices]\n", " y = y[shuffled_indices]\n", "\n", " # Split data into train/test\n", " test_size = int(0.2 * N) # Assign test datset size using 20% of samples\n", " X_test = X[:test_size]\n", " y_test = y[:test_size]\n", " X_train = X[test_size:]\n", " y_train = y[test_size:]\n", "\n", " return X_test, y_test, X_train, y_train" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting functions\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Plotting functions\n", "def imshow(img):\n", " \"\"\"\n", " Helper function to plot unnormalised image\n", "\n", " Args:\n", " img: torch.tensor\n", " Image to be displayed\n", "\n", " Returns:\n", " Nothing\n", " \"\"\"\n", " img = img / 2 + 0.5 # Unnormalize\n", " npimg = img.numpy()\n", " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", " plt.axis(False)\n", " plt.show()\n", "\n", "\n", "def sample_grid(M=500, x_max=2.0):\n", " \"\"\"\n", " Helper function to simulate sample meshgrid\n", "\n", " Args:\n", " M: int\n", " Size of the constructed tensor with meshgrid\n", " x_max: float\n", " Defines range for the set of points\n", "\n", " Returns:\n", " X_all: torch.tensor\n", " Concatenated 
meshgrid tensor\n", " \"\"\"\n", " ii, jj = torch.meshgrid(torch.linspace(-x_max, x_max, M),\n", " torch.linspace(-x_max, x_max, M),\n", " indexing='ij')\n", " X_all = torch.cat([ii.unsqueeze(-1),\n", " jj.unsqueeze(-1)],\n", " dim=-1).view(-1, 2)\n", " return X_all\n", "\n", "\n", "def plot_decision_map(X_all, y_pred, X_test, y_test,\n", " M=500, x_max=2.0, eps=1e-3):\n", " \"\"\"\n", " Helper function to plot decision map\n", "\n", " Args:\n", " X_all: torch.tensor\n", " Concatenated meshgrid tensor\n", " y_pred: torch.tensor\n", " Labels predicted by the network\n", " X_test: torch.tensor\n", " Test data\n", " y_test: torch.tensor\n", " Labels of the test data\n", " M: int\n", " Size of the constructed tensor with meshgrid\n", " x_max: float\n", " Defines range for the set of points\n", " eps: float\n", " Squared-distance threshold for overlaying test points on the map\n", "\n", " Returns:\n", " Nothing\n", " \"\"\"\n", " decision_map = torch.argmax(y_pred, dim=1)\n", "\n", " for i in range(len(X_test)):\n", " indices = (X_all[:, 0] - X_test[i, 0])**2 + (X_all[:, 1] - X_test[i, 1])**2 < eps\n", " decision_map[indices] = (K + y_test[i]).long()\n", "\n", " decision_map = decision_map.view(M, M).cpu()\n", " plt.imshow(decision_map, extent=[-x_max, x_max, -x_max, x_max], cmap='jet')\n", " plt.axis('off')\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set random seed\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Executing `set_seed(seed=seed)` sets the seed\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Set random seed\n", "\n", "# @markdown Executing `set_seed(seed=seed)` sets the seed\n", "\n", "# For DL it's critical to set the random seed so that students can have a\n", "# baseline to compare their results to expected results.\n", "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n", "\n", "# Call `set_seed` function in the exercises to ensure reproducibility.\n", "import random\n", "import torch\n", "\n", "def set_seed(seed=None, seed_torch=True):\n", " \"\"\"\n", " Function that controls randomness. NumPy and random modules must be imported.\n", "\n", " Args:\n", " seed : Integer\n", " A non-negative integer that defines the random state. Default is `None`.\n", " seed_torch : Boolean\n", " If `True` sets the random seed for pytorch tensors, so pytorch module\n", " must be imported. Default is `True`.\n", "\n", " Returns:\n", " Nothing.\n", " \"\"\"\n", " if seed is None:\n", " seed = np.random.choice(2 ** 32)\n", " random.seed(seed)\n", " np.random.seed(seed)\n", " if seed_torch:\n", " torch.manual_seed(seed)\n", " torch.cuda.manual_seed_all(seed)\n", " torch.cuda.manual_seed(seed)\n", " torch.backends.cudnn.benchmark = False\n", " torch.backends.cudnn.deterministic = True\n", "\n", " print(f'Random seed {seed} has been set.')\n", "\n", "\n", "# In case that `DataLoader` is used\n", "def seed_worker(worker_id):\n", " \"\"\"\n", " DataLoader will reseed workers following randomness in\n", " multi-process data loading algorithm.\n", "\n", " Args:\n", " worker_id: integer\n", " ID of subprocess to seed. 
0 means that\n", " the data will be loaded in the main process\n", " Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details\n", "\n", " Returns:\n", " Nothing\n", " \"\"\"\n", " worker_seed = torch.initial_seed() % 2**32\n", " np.random.seed(worker_seed)\n", " random.seed(worker_seed)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set device (GPU or CPU). Execute `set_device()`\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Set device (GPU or CPU). Execute `set_device()`\n", "# especially if torch modules are used.\n", "\n", "# Inform the user if the notebook uses GPU or CPU.\n", "\n", "def set_device():\n", " \"\"\"\n", " Set the device. CUDA if available, CPU otherwise\n", "\n", " Args:\n", " None\n", "\n", " Returns:\n", " Nothing\n", " \"\"\"\n", " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", " if device != \"cuda\":\n", " print(\"GPU is not enabled in this notebook. \\n\"\n", " \"If you want to enable it, go to the menu `Runtime` -> \\n\"\n", " \"`Change runtime type` and select `GPU` from the `Hardware accelerator` dropdown menu\")\n", " else:\n", " print(\"GPU is enabled in this notebook. \\n\"\n", " \"If you want to disable it, go to the menu `Runtime` -> \\n\"\n", " \"`Change runtime type` and select `None` from the `Hardware accelerator` dropdown menu\")\n", "\n", " return device" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "SEED = 2021\n", "set_seed(seed=SEED)\n", "DEVICE = set_device()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download of the Animal Faces dataset\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " The Animal Faces dataset consists of 16,130 32x32 images belonging to 3 classes\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Download of the Animal Faces dataset\n", "# @markdown The Animal Faces dataset consists of 16,130 32x32 images belonging to 3 classes\n", "import requests, os\n", "from zipfile import ZipFile\n", "\n", "print(\"Start downloading and unzipping `AnimalFaces` dataset...\")\n", "name = 'AnimalFaces32x32'\n", "fname = f\"{name}.zip\"\n", "url = \"https://osf.io/kgfvj/download\"\n", "r = requests.get(url, allow_redirects=True)\n", "with open(fname, 'wb') as fh:\n", " fh.write(r.content)\n", "\n", "with ZipFile(fname, 'r') as zfile:\n", " zfile.extractall(f\"./{name}\")\n", "\n", "if os.path.exists(fname):\n", " os.remove(fname)\n", "else:\n", " print(f\"The file {fname} does not exist\")\n", "\n", "os.chdir(name)\n", "print(\"Download completed.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Loader\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Execute this cell!\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Data Loader\n", "# @markdown Execute this cell!\n", "K = 4\n", "sigma = 0.4\n", "N = 1000\n", "t = torch.linspace(0, 1, N)\n", "X = torch.zeros(K*N, 2)\n", "y = torch.zeros(K*N)\n", "for k in range(K):\n", " X[k*N:(k+1)*N, 0] = t*(torch.sin(2*np.pi/K*(2*t+k)) + sigma**2*torch.randn(N))\n", " X[k*N:(k+1)*N, 1] = t*(torch.cos(2*np.pi/K*(2*t+k)) + sigma**2*torch.randn(N))\n", " y[k*N:(k+1)*N] = k\n",
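"\n", "# X now holds K=4 noisy spiral arms in 2D; y labels each point with its arm index\n",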
"\n", "\n", "X_test, y_test, X_train, y_train = shuffle_and_split_data(X, y, seed=SEED)\n", "\n", "# DataLoader with random seed\n", "batch_size = 128\n", "g_seed = torch.Generator()\n", "g_seed.manual_seed(SEED)\n", "\n", "test_data = TensorDataset(X_test, y_test)\n", "test_loader = DataLoader(test_data, batch_size=batch_size,\n", " shuffle=False, num_workers=0,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed,\n", " )\n", "\n", "train_data = TensorDataset(X_train, y_train)\n", "train_loader = DataLoader(train_data,\n", " batch_size=batch_size,\n", " drop_last=True,\n", " shuffle=True,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed,\n", " )" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Section 1: Wider vs deeper networks\n", "\n", "*Time estimate: ~45 mins*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Video 1: Deep Expressivity\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @title Video 1: Deep Expressivity\n", "from ipywidgets import widgets\n", "from IPython.display import YouTubeVideo\n", "from IPython.display import IFrame\n", "from IPython.display import display\n", "\n", "\n", "class PlayVideo(IFrame):\n", " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", " self.id = id\n", " if source == 'Bilibili':\n", " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", " elif source == 'Osf':\n", " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", "\n", "\n", "def display_videos(video_ids, W=400, H=300, fs=1):\n", " tab_contents = []\n", " for i, video_id in enumerate(video_ids):\n", " out = widgets.Output()\n", " with out:\n", " if video_ids[i][0] == 'Youtube':\n", " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", " height=H, fs=fs, rel=0)\n", " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", " else:\n", " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", " height=H, fs=fs, autoplay=False)\n", " if video_ids[i][0] == 'Bilibili':\n", " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", " elif video_ids[i][0] == 'Osf':\n", " print(f'Video available at https://osf.io/{video.id}')\n", " display(video)\n", " tab_contents.append(out)\n", " return tab_contents\n", "\n", "\n", "video_ids = [('Youtube', 'g8JuGrNk9ag'), ('Bilibili', 'BV19f4y157vG')]\n", "tab_contents = display_videos(video_ids, W=730, H=410)\n", "tabs = widgets.Tab()\n", "tabs.children = tab_contents\n", "for i in range(len(tab_contents)):\n", " tabs.set_title(i, video_ids[i][0])\n", "display(tabs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Deep_Expressivity_Video\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Coding Exercise 1: Wide vs. Deep while keeping number of parameters same\n", "Let's find the optimal number of hidden layers under a fixed number of parameters constraint!\n", "But first, we need a model parameter counter. 
You could iterate over the model layers by calling `.parameters()` and then use `.numel()` to count the layer parameters. Also, you can use the [`requires_grad`](https://pytorch.org/docs/stable/notes/autograd.html) attribute to make sure it's a trainable parameter. E.g.,\n", "```python\n", "x = torch.ones(10, 5, requires_grad=True)\n", "print(x.numel())  # 50, the number of elements in x\n", "```\n", "After defining the counter function, we will increase the depth step by step and, for each depth, iterate over the possible numbers of hidden units (assumed to be the same for all hidden layers); then, using our parameter counter, we choose the number of hidden units that results in a total close to `max_par_count` parameters." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "def run_depth_optimizer(max_par_count, max_hidden_layer, device):\n", " \"\"\"\n", " Simulate Depth Optimizer\n", "\n", " Args:\n", " max_par_count: int\n", " Maximum number of trainable parameters allowed per network\n", " max_hidden_layer: int\n", " Maximum number of hidden layers to try\n", " device: string\n", " CUDA/GPU if available, CPU otherwise\n", "\n", " Returns:\n", " hidden_layers: range\n", " Numbers of hidden layers that were tried\n", " test_scores: list\n", " Log of test scores\n", " \"\"\"\n", " ####################################################################\n", " # Fill in all missing code below (...),\n", " # then remove or comment the line below to test your function\n", " raise NotImplementedError(\"Define the depth optimizer function\")\n", " ###################################################################\n", "\n", " def count_parameters(model):\n", " \"\"\"\n", " Function to count model parameters\n", "\n", " Args:\n", " model: instance of Net class\n", " MLP instance\n", "\n", " Returns:\n", " par_count: int\n", " Number of parameters in network\n", " \"\"\"\n", " par_count = 0\n", " for p in model.parameters():\n", " if p.requires_grad:\n", " par_count += ...\n", " return par_count\n", "\n", " # Number of hidden layers to try\n", " hidden_layers = ...\n", "\n", " # Test score list\n", " test_scores = []\n", "\n", " for hidden_layer in hidden_layers:\n", " # Initialize the hidden units in each hidden layer to be 1\n", " hidden_units = np.ones(hidden_layer, dtype=int)\n", "\n", " # Define the network with the current number of hidden units\n", " wide_net = Net('ReLU()', X_train.shape[1], hidden_units, K).to(device)\n", " par_count = count_parameters(wide_net)\n", "\n", " # Increment hidden_units and repeat until the par_count reaches the desired count\n", " while par_count < max_par_count:\n", " hidden_units += 1\n", " wide_net = Net('ReLU()', X_train.shape[1], hidden_units, K).to(device)\n", " par_count = ...\n", "\n", " # Train it\n", " criterion = nn.CrossEntropyLoss()\n", " optimizer = optim.Adam(wide_net.parameters(), lr=1e-3)\n", " _, test_acc = train_test_classification(wide_net, criterion, optimizer,\n", " train_loader, test_loader,\n", " num_epochs=100, device=device)\n", " test_scores += [test_acc]\n", "\n", " return hidden_layers, test_scores\n", "\n", "\n", "\n", "set_seed(seed=SEED)\n", "max_par_count = 100\n", "max_hidden_layer = 5\n", "## Uncomment below to test your function\n", "# hidden_layers, test_scores = run_depth_optimizer(max_par_count, max_hidden_layer, DEVICE)\n", "# plt.xlabel('# of hidden layers')\n", "# plt.ylabel('Test accuracy')\n", "# plt.plot(hidden_layers, test_scores)\n", "# plt.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {} }, 
"source": [ "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D3_MultiLayerPerceptrons/solutions/W1D3_Tutorial2_Solution_c8ca804d.py)\n", "\n", "*Example output:*\n", "\n", "Solution hint\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Wide_vs_Deep_Exercise\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Think! 1: Why the tradeoff?\n", "Here we see that there is a particular number of hidden layers that is optimum. Why do you think increasing hidden layers after a certain point hurt in this scenario?" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {} }, "source": [ "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D3_MultiLayerPerceptrons/solutions/W1D3_Tutorial2_Solution_bb717836.py)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Wide_vs_Deep_Discussion\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Section 1.1: Where Wide Fails\n", "Let's use the same spiral dataset generated before with two features. And then add more polynomial features (which makes the first layer wider). And finally, train a single linear layer. We could use the same MLP network with no hidden layers (though it would not be called an MLP anymore!).\n", "\n", "Note that we will add polynomial terms upto $P=50$ which means that for every $x_1^n x_2^m$ term, $n+m\\leq P$. Now it's fun math exercise to prove why the total number of polynomial features upto $P$ becomes:\n", "\n", "\\begin{equation}\n", "\\text{# of terms} = \\frac{(P+1)(P+2)}{2}\n", "\\end{equation}\n", "\n", "Also, we don't need the polynomial term with degree zero (which is the constatnt term) since `nn.Linear` layers have bias terms. Therefore we will have one fewer polynomial feature." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "def run_poly_classification(poly_degree, device='cpu', seed=0):\n", " \"\"\"\n", " Helper function to run the above defined polynomial classifier\n", "\n", " Args:\n", " poly_degree: int\n", " Degree of the polynomial\n", " device: string\n", " CUDA/GPU if available, CPU otherwise\n", " seed: int\n", " A non-negative integer that defines the random state. 
Default is 0.\n", "\n", " Returns:\n", " num_features: int\n", " Number of features\n", " \"\"\"\n", "\n", " def make_poly_features(poly_degree, X):\n", " \"\"\"\n", " Function to build the polynomial features, excluding the bias (degree-zero) term\n", "\n", " Args:\n", " poly_degree: int\n", " Degree of the polynomial\n", " X: torch.tensor\n", " Input data\n", "\n", " Returns:\n", " poly_X: torch.tensor\n", " Polynomial features\n", " num_features: int\n", " Number of features\n", " \"\"\"\n", " num_features = (poly_degree + 1)*(poly_degree + 2) // 2 - 1\n", " poly_X = torch.zeros((X.shape[0], num_features))\n", " count = 0\n", " for i in range(poly_degree+1):\n", " for j in range(poly_degree+1):\n", " # No need to add the degree-zero term since the model has biases\n", " if 0 < i + j <= poly_degree:\n", " # Define the polynomial term\n", " poly_X[:, count] = X[:, 0]**i * X[:, 1]**j\n", " count += 1\n", " return poly_X, num_features\n", "\n", " poly_X_test, num_features = make_poly_features(poly_degree, X_test)\n", " poly_X_train, _ = make_poly_features(poly_degree, X_train)\n", "\n", " batch_size = 128\n", "\n", " g_seed = torch.Generator()\n", " g_seed.manual_seed(seed)\n", " poly_test_data = TensorDataset(poly_X_test, y_test)\n", " poly_test_loader = DataLoader(poly_test_data,\n", " batch_size=batch_size,\n", " shuffle=False,\n", " num_workers=1,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed)\n", "\n", " poly_train_data = TensorDataset(poly_X_train, y_train)\n", " poly_train_loader = DataLoader(poly_train_data,\n", " batch_size=batch_size,\n", " shuffle=True,\n", " num_workers=1,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed)\n", "\n", " # Define a linear model using MLP class\n", " poly_net = Net('ReLU()', num_features, [], K).to(device)\n", "\n", " # Train it!\n", " criterion = nn.CrossEntropyLoss()\n", " optimizer = optim.Adam(poly_net.parameters(), lr=1e-3)\n", " _, _ = train_test_classification(poly_net, criterion, optimizer,\n", " poly_train_loader, poly_test_loader,\n", " num_epochs=100, device=device)\n", " # Test it\n", " X_all = sample_grid().to(device)\n", " poly_X_all, _ = make_poly_features(poly_degree, X_all)\n", " y_pred = poly_net(poly_X_all.to(device))\n", "\n", " # Plot it\n", " plot_decision_map(X_all.cpu(), y_pred.cpu(), X_test.cpu(), y_test.cpu())\n", " plt.show()\n", "\n", " return num_features\n", "\n", "\n", "set_seed(seed=SEED)\n", "max_poly_degree = 50\n", "num_features = run_poly_classification(max_poly_degree, DEVICE, SEED)\n", "print(f'Number of features: {num_features}')" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "### Think! 1.1: Does a wide model generalize well?\n", "\n", "Do you think this model is performing well outside its training distribution? Why?"
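 ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "Before peeking at the solution, here is a quick numerical probe (a minimal sketch with illustrative values): a single high-degree monomial feature such as $x_1^{50}$ is tiny inside the sampled range but blows up just outside it, which hints at why a linear readout on polynomial features extrapolates poorly.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "# Magnitude of a degree-50 monomial inside vs. just outside the training range\n", "for x1 in [0.5, 1.0, 2.0, 3.0]:\n", " print(f'x1 = {x1}: x1**50 = {x1**50:.3e}')"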
] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {} }, "source": [ "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D3_MultiLayerPerceptrons/solutions/W1D3_Tutorial2_Solution_9a2666ab.py)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Does_a_wide_model_generalize_well_Discussion\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Section 2: Deeper MLPs\n", "\n", "*Time estimate: ~55 mins*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Video 2: Case study\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @title Video 2: Case study\n", "from ipywidgets import widgets\n", "from IPython.display import YouTubeVideo\n", "from IPython.display import IFrame\n", "from IPython.display import display\n", "\n", "\n", "class PlayVideo(IFrame):\n", " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", " self.id = id\n", " if source == 'Bilibili':\n", " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", " elif source == 'Osf':\n", " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", "\n", "\n", "def display_videos(video_ids, W=400, H=300, fs=1):\n", " tab_contents = []\n", " for i, video_id in enumerate(video_ids):\n", " out = widgets.Output()\n", " with out:\n", " if video_ids[i][0] == 'Youtube':\n", " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", " height=H, fs=fs, rel=0)\n", " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", " else:\n", " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", " height=H, fs=fs, autoplay=False)\n", " if video_ids[i][0] == 'Bilibili':\n", " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", " elif video_ids[i][0] == 'Osf':\n", " print(f'Video available at https://osf.io/{video.id}')\n", " display(video)\n", " tab_contents.append(out)\n", " return tab_contents\n", "\n", "\n", "video_ids = [('Youtube', '3g_OJ6dYE8E'), ('Bilibili', 'BV1FL411n7SH')]\n", "tab_contents = display_videos(video_ids, W=730, H=410)\n", "tabs = widgets.Tab()\n", "tabs.children = tab_contents\n", "for i in range(len(tab_contents)):\n", " tabs.set_title(i, video_ids[i][0])\n", "display(tabs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Case_study_Video\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Coding Exercise 2: Dataloader on a real-world dataset\n", "Let's build our first real-world dataset loader with Data Preprocessing and Augmentation! 
We will use Torchvision transforms to do it.\n", "We'd like to have a simple data augmentation with the following steps:\n", "* Random rotation by up to $10$ degrees (`.RandomRotation`)\n", "* Random horizontal flipping (`.RandomHorizontalFlip`)\n", "\n", "We'd also like preprocessing that:\n", "* converts images to PyTorch tensors in the range $[0, 1]$ (`.ToTensor`)\n", "* normalizes the input to the range $[-1, 1]$ (`.Normalize`)\n", "
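\n", "With mean $= 0.5$ and std $= 0.5$ per channel, `.Normalize` maps $[0, 1]$ to $[-1, 1]$, since $(x - 0.5)/0.5 = 2x - 1$.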
\n", "\n", "**Hint:** For more info on transform, see the [official documentation](https://pytorch.org/vision/stable/transforms.html)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "def get_data_loaders(batch_size, seed):\n", " \"\"\"\n", " Helper function to get data loaders\n", "\n", " Args:\n", " batch_size: int\n", " Batch size\n", " seed: int\n", " A non-negative integer that defines the random state.\n", "\n", " Returns:\n", " img_train_loader: torch.utils.data type\n", " Combines the train dataset and sampler, and provides an iterable over the given dataset.\n", " img_test_loader: torch.utils.data type\n", " Combines the test dataset and sampler, and provides an iterable over the given dataset.\n", " \"\"\"\n", " ####################################################################\n", " # Fill in all missing code below (...),\n", " # then remove or comment the line below to test your function\n", " raise NotImplementedError(\"Define the get data loaders function\")\n", " ###################################################################\n", "\n", " # Define the transform done only during training\n", " augmentation_transforms = ...\n", "\n", " # Define the transform done in training and testing (after augmentation)\n", " mean = (0.5, 0.5, 0.5) # defined sequence of means per channel\n", " std = (0.5, 0.5, 0.5) # defined sequence of std deviations per channel\n", " # Note that the transform should normalize each channel: output[channel] = (input[channel] - mean[channel]) / std[channel]\n", " preprocessing_transforms = ...\n", "\n", " # Compose them together\n", " train_transform = transforms.Compose(augmentation_transforms + preprocessing_transforms)\n", " test_transform = transforms.Compose(preprocessing_transforms)\n", "\n", " # Using pathlib to be compatible with all OS's\n", " data_path = pathlib.Path('.')/'afhq'\n", "\n", " # Define the dataset objects (they can load one by one)\n", " img_train_dataset = ImageFolder(data_path/'train', transform=train_transform)\n", " img_test_dataset = ImageFolder(data_path/'val', transform=test_transform)\n", "\n", " g_seed = torch.Generator()\n", " g_seed.manual_seed(seed)\n", " # Define the dataloader objects (they can load batch by batch)\n", " img_train_loader = DataLoader(img_train_dataset,\n", " batch_size=batch_size,\n", " shuffle=True,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed)\n", " # num_workers can be set to higher if running on Colab Pro TPUs to speed up,\n", " # with more than one worker, it will do multithreading to queue batches\n", " img_test_loader = DataLoader(img_test_dataset,\n", " batch_size=batch_size,\n", " shuffle=False,\n", " num_workers=1,\n", " worker_init_fn=seed_worker,\n", " generator=g_seed)\n", "\n", " return img_train_loader, img_test_loader\n", "\n", "\n", "batch_size = 64\n", "set_seed(seed=SEED)\n", "## Uncomment below to test your function\n", "# img_train_loader, img_test_loader = get_data_loaders(batch_size, SEED)\n", "## get some random training images\n", "# dataiter = iter(img_train_loader)\n", "# images, labels = next(dataiter)\n", "## show images\n", "# imshow(make_grid(images, nrow=8))" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {} }, "source": [ "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D3_MultiLayerPerceptrons/solutions/W1D3_Tutorial2_Solution_2e0f7e91.py)\n", "\n", "*Example output:*\n", "\n", "Solution hint\n", "\n" ] 
}, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "# Train it\n", "set_seed(seed=SEED)\n", "net = Net('ReLU()', 3*32*32, [64, 64, 64], 3).to(DEVICE)\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = optim.Adam(net.parameters(), lr=3e-4)\n", "_, _ = train_test_classification(net, criterion, optimizer,\n", " img_train_loader, img_test_loader,\n", " num_epochs=30, device=DEVICE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "# Visualize the feature map\n", "fc1_weights = net.mlp[0].weight.view(64, 3, 32, 32).detach().cpu()\n", "fc1_weights /= torch.max(torch.abs(fc1_weights))\n", "imshow(make_grid(fc1_weights, nrow=8))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Dataloader_real_world_Exercise\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Think! 2: Why first layer features are high level?\n", "Even though it's three layers deep, we see distinct animal faces in the first layer feature map. Do you think this MLP has a hierarchical feature representation? Why?" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "execution": {} }, "source": [ "[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main/tutorials/W1D3_MultiLayerPerceptrons/solutions/W1D3_Tutorial2_Solution_eb2e554f.py)\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Why_first_layer_features_are_high_level_Discussion\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Section 3: Ethical aspects\n", "\n", "*Time estimate: ~20 mins*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Video 3: Ethics: Hype in AI\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @title Video 3: Ethics: Hype in AI\n", "from ipywidgets import widgets\n", "from IPython.display import YouTubeVideo\n", "from IPython.display import IFrame\n", "from IPython.display import display\n", "\n", "\n", "class PlayVideo(IFrame):\n", " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", " self.id = id\n", " if source == 'Bilibili':\n", " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", " elif source == 'Osf':\n", " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", "\n", "\n", "def display_videos(video_ids, W=400, H=300, fs=1):\n", " tab_contents = []\n", " for i, video_id in enumerate(video_ids):\n", " out = widgets.Output()\n", " with out:\n", " if video_ids[i][0] == 'Youtube':\n", " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", " height=H, fs=fs, rel=0)\n", " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", " else:\n", " video = 
PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", " height=H, fs=fs, autoplay=False)\n", " if video_ids[i][0] == 'Bilibili':\n", " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", " elif video_ids[i][0] == 'Osf':\n", " print(f'Video available at https://osf.io/{video.id}')\n", " display(video)\n", " tab_contents.append(out)\n", " return tab_contents\n", "\n", "\n", "video_ids = [('Youtube', 'ou35QzsKsdc'), ('Bilibili', 'BV1CP4y1s712')]\n", "tab_contents = display_videos(video_ids, W=730, H=410)\n", "tabs = widgets.Tab()\n", "tabs.children = tab_contents\n", "for i in range(len(tab_contents)):\n", " tabs.set_title(i, video_ids[i][0])\n", "display(tabs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Ethics_Hype_in_AI_Video\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Summary" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "In the second tutorial of this day, we have dived deeper into MLPs and seen more of their mathematical and practical aspects. More specifically, we have learned about different architectures, i.e., deep, wide, and how they are dependent on the transfer function used. Also, we have learned about the importance of initialization, and we mathematically analyzed two methods for smart initialization." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Video 4: Outro\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @title Video 4: Outro\n", "from ipywidgets import widgets\n", "from IPython.display import YouTubeVideo\n", "from IPython.display import IFrame\n", "from IPython.display import display\n", "\n", "\n", "class PlayVideo(IFrame):\n", " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", " self.id = id\n", " if source == 'Bilibili':\n", " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", " elif source == 'Osf':\n", " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", "\n", "\n", "def display_videos(video_ids, W=400, H=300, fs=1):\n", " tab_contents = []\n", " for i, video_id in enumerate(video_ids):\n", " out = widgets.Output()\n", " with out:\n", " if video_ids[i][0] == 'Youtube':\n", " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", " height=H, fs=fs, rel=0)\n", " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", " else:\n", " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", " height=H, fs=fs, autoplay=False)\n", " if video_ids[i][0] == 'Bilibili':\n", " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", " elif video_ids[i][0] == 'Osf':\n", " print(f'Video available at https://osf.io/{video.id}')\n", " display(video)\n", " tab_contents.append(out)\n", " return tab_contents\n", "\n", "\n", "video_ids = [('Youtube', '2sEPw4sSfSw'), ('Bilibili', 'BV1Kb4y1r76G')]\n", "tab_contents = display_videos(video_ids, W=730, H=410)\n", "tabs = widgets.Tab()\n", "tabs.children = tab_contents\n", "for i in 
range(len(tab_contents)):\n", " tabs.set_title(i, video_ids[i][0])\n", "display(tabs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Outro_Video\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Daily survey\n", "\n", "Don't forget to complete your reflections and content check in the daily survey! Please be patient after logging in as there is\n", "a small delay before you will be redirected to the survey.\n", "\n", "\"button" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "---\n", "# Bonus: The need for good initialization\n", "In this section, we derive principles for initializing deep networks. We will see that if the weights are too large, then the forward propagation of signals will be chaotic, and the backpropagation of error gradients will explode. On the other hand, if the weights are too small, the forward propagation of signals will be ordered, and the backpropagation of error gradients will vanish. The key idea behind initialization is to choose the weights to be just right, i.e., at the edge between order and chaos. In this section, we derive this edge and show how to compute the correct initial variance of the weights.\n", "\n", "Many of the typical initialization schemes in existing deep learning frameworks implicitly employ this principle of initialization at the edge of chaos. So this section can be safely skipped on first pass and **is, hence, a bonus section**." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Video 5: Need for Good Initialization\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "remove-input" ] }, "outputs": [], "source": [ "# @title Video 5: Need for Good Initialization\n", "from ipywidgets import widgets\n", "from IPython.display import YouTubeVideo\n", "from IPython.display import IFrame\n", "from IPython.display import display\n", "\n", "\n", "class PlayVideo(IFrame):\n", " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", " self.id = id\n", " if source == 'Bilibili':\n", " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", " elif source == 'Osf':\n", " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", "\n", "\n", "def display_videos(video_ids, W=400, H=300, fs=1):\n", " tab_contents = []\n", " for i, video_id in enumerate(video_ids):\n", " out = widgets.Output()\n", " with out:\n", " if video_ids[i][0] == 'Youtube':\n", " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", " height=H, fs=fs, rel=0)\n", " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", " else:\n", " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", " height=H, fs=fs, autoplay=False)\n", " if video_ids[i][0] == 'Bilibili':\n", " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", " elif video_ids[i][0] == 'Osf':\n", " print(f'Video available at https://osf.io/{video.id}')\n", " display(video)\n", " tab_contents.append(out)\n", " return tab_contents\n", "\n", "\n", "video_ids = [('Youtube', 'W0V2kwHSuUI'), ('Bilibili', 
'BV1Qq4y1H7Px')]\n", "tab_contents = display_videos(video_ids, W=730, H=410)\n", "tabs = widgets.Tab()\n", "tabs.children = tab_contents\n", "for i in range(len(tab_contents)):\n", " tabs.set_title(i, video_ids[i][0])\n", "display(tabs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Submit your feedback\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "execution": {}, "tags": [ "hide-input" ] }, "outputs": [], "source": [ "# @title Submit your feedback\n", "content_review(f\"{feedback_prefix}_Need_for_Good_Initialization_Bonus_Video\")" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Xavier initialization\n", "Let us look at the scale distribution of an output (e.g., a hidden variable) $o_i$ for some fully-connected layer without nonlinearities. With $n_{in}$ inputs ($x_j$) and their associated weights $w_{ij}$ for this layer, an output is given by\n", "\n", "\\begin{equation}\n", " o_{i} = \\sum_{j=1}^{n_\\mathrm{in}} w_{ij} x_j\n", "\\end{equation}\n", "\n", "The weights $w_{ij}$ are all drawn independently from the same distribution. Furthermore, let us assume that this distribution has zero mean and variance $\\sigma^2$. Note that this does not mean that the distribution has to be Gaussian, just that the mean and variance need to exist. For now, let us assume that the inputs to the layer $x_j$ also have zero mean and variance $\\gamma^2$ and that they are independent of $w_{ij}$ and independent of each other. In this case, we can compute the mean and variance of $o_i$ as follows:\n", "\n", "\n", "\\begin{align}\n", " E[o_i] &= \\sum_{j=1}^{n_\\mathrm{in}} E[w_{ij} x_j] \\\\ \\\\\n", " &= \\sum_{j=1}^{n_\\mathrm{in}} E[w_{ij}] E[x_j] = 0, \\\\ \\\\ \\\\\n", " \\mathrm{Var}[o_i] &= E[o_i^2] - (E[o_i])^2 \\\\ \\\\\n", " &= \\sum_{j=1}^{n_\\mathrm{in}} E[w^2_{ij} x^2_j] - 0 \\\\ \\\\\n", " &= \\sum_{j=1}^{n_\\mathrm{in}} E[w^2_{ij}] E[x^2_j] \\\\ \\\\\n", " &= n_\\mathrm{in} \\sigma^2 \\gamma^2\n", "\\end{align}\n", "\n", "
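\n", "A quick empirical check of this last formula (a minimal sketch; the sizes and numbers are illustrative):\n", "\n", "```python\n", "import torch\n", "\n", "n_in, sigma, gamma = 256, 0.1, 1.0\n", "w = torch.randn(10_000, n_in) * sigma  # 10,000 draws of a weight row\n", "x = torch.randn(10_000, n_in) * gamma  # matching input draws\n", "o = (w * x).sum(dim=1)  # 10,000 samples of o_i\n", "print(o.var().item())  # close to n_in * sigma**2 * gamma**2 = 2.56\n", "```\n", "\n", "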
\n", "\n", "One way to keep the variance fixed is to set $n_{in}\\sigma^2=1$ . Now consider backpropagation. There we face a similar problem, albeit with gradients being propagated from the layers closer to the output. Using the same reasoning as for forward propagation, we see that the gradients’ variance can blow up unless $n_{out}\\sigma^2=1$ , where $n_{out}$ is the number of outputs of this layer. This leaves us in a dilemma: we cannot possibly satisfy both conditions simultaneously. Instead, we simply try to satisfy:\n", "\n", "\\begin{align}\n", "\\frac{1}{2} (n_\\mathrm{in} + n_\\mathrm{out}) \\sigma^2 = 1 \\text{ or equivalently }\n", "\\sigma = \\sqrt{\\frac{2}{n_\\mathrm{in} + n_\\mathrm{out}}}\n", "\\end{align}\n", "\n", "
\n", "\n", "This is the reasoning underlying the now-standard and practically beneficial Xavier initialization, named after the first author of its creators [Glorot and Bengio, 2010](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). Typically, the Xavier initialization samples weights from a Gaussian distribution with zero mean and variance $\\sigma^2=\\frac{2}{(n_{in}+n_{out})}$,\n", "\n", "\n", "\\begin{equation}\n", " w_{ij} \\sim \\mathcal{N} \\left (\\mu=0, \\sigma=\\sqrt{\\frac{2}{(n_{in}+n_{out})}} \\right)\n", "\\end{equation}\n", "\n", "\n", "We can also adapt Xavier’s intuition to choose the variance when sampling weights from a uniform distribution. Note that the uniform distribution $\\mathcal{U}(−a,a)$ has variance $\\frac{a^2}{3}$. Plugging this into our condition on $\\sigma^2$ yields the suggestion to initialize according to\n", "\n", "\n", "\\begin{equation}\n", "w_{ij} \\sim \\mathcal{U} \\left(-\\sqrt{\\frac{6}{n_\\mathrm{in} + n_\\mathrm{out}}}, \\sqrt{\\frac{6}{n_\\mathrm{in} + n_\\mathrm{out}}}\\right)\n", "\\end{equation}\n", "\n", "\n", "This explanation is mainly taken from [here](https://d2l.ai/chapter_multilayer-perceptrons/numerical-stability-and-init.html)." ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "If you want to see more about initializations and their differences see [here](https://www.deeplearning.ai/ai-notes/initialization/)." ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Initialization with transfer function\n", "Let's derive the optimal gain for LeakyReLU following similar steps.\n", "\n", "LeakyReLU is described mathematically:\n", "\n", "\\begin{equation}\n", "f(x)=\\left\\{\n", " \\begin{array}{ll}\n", " \\alpha \\cdot x & \\text { for } x<0 \\\\\n", " x & \\text { for } x \\geq 0\n", " \\end{array}\\right.\n", "\\end{equation}\n", "\n", "where $\\alpha$ controls the angle of the negative slope.\n", "\n", "Considering a single layer with this activation function gives,\n", "\n", "\\begin{align}\n", "o_{i} &= \\sum_{j=1}^{n_\\mathrm{in}} w_{ij} x_j\\\\\n", "z_{i} &= f\\left( o_{i} \\right)\n", "\\end{align}\n", "\n", "where $z_i$ denotes the activation of node $i$.\n", "\n", "The expectation of the output is still zero, i.e., $\\mathbb{E}[f(o_i)=0]$, but the variance changes, and assuming that the probability $P(x < 0) = 0.5$, we have that:\n", "\n", "\n", "\\begin{align}\n", "\\mathrm{Var}[f(o_i)] &= \\mathbb{E}[f(o_i)^2] - \\left( \\mathbb{E}[f(o_i)] \\right)^{2} \\\\ \\\\\n", "&= \\frac{\\mathrm{Var}[o_i] + \\alpha^2 \\mathrm{Var}[o_i]}{2} \\\\ \\\\\n", "&= \\frac{1+\\alpha^2}{2}n_\\mathrm{in} \\sigma^2 \\gamma^2\n", "\\end{align}\n", "\n", "where $\\gamma$ is the variance of the distribution of the inputs $x_j$ and $\\sigma$ is the variance of the distribution of weights $w_{ij}$, as before.\n", "\n", "Therefore, following the rest of derivation as before,\n", "\n", "
\n", "\n", "\\begin{equation}\n", "\\sigma = gain\\sqrt{\\frac{2}{n_\\mathrm{in} + n_\\mathrm{out}}}, \\, \\text{where} \\,\\, gain = \\sqrt{\\frac{2}{1+\\alpha^2}}\n", "\\end{equation}\n", "\n", "As we can see from the derived formula of $\\sigma$, the transfer function we choose is related with the variance of the distribution of the weights. As the negative slope of the LeakyReLU $\\alpha$ becomes larger, the $gain$ becomes smaller and thus, the distribution of the weights is narrower. On the other hand, as $\\alpha$ becomes smaller and smaller, the distribution of the weights is wider. Recall that, we initialize our weights, for example, by sampling from a normal distribution with zero mean and variance $\\sigma^2$." ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "## Best gain for Xavier Initialization with Leaky ReLU\n", "You're probably running out of time, so let me explain what's happening here. We derived a theoretical gain for initialization. But the question is whether it holds in practice? Here we have a setup to confirm our finding. We will try a range of gains and see the empirical optimum and whether it matches our theoretical value!\n", "\n", "If you have time left, you can change the distribution to sample the initial weights from a uniform distribution by changing the `mode` argument." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "N = 10 # Number of trials\n", "gains = np.linspace(1/N, 3.0, N)\n", "test_accs = []\n", "train_accs = []\n", "mode = 'uniform'\n", "for gain in gains:\n", " print(f'\\ngain: {gain:.2f}')\n", "\n", " def init_weights(m, mode='normal'):\n", " if type(m) == nn.Linear:\n", " if mode == 'normal':\n", " torch.nn.init.xavier_normal_(m.weight, gain)\n", " elif mode == 'uniform':\n", " torch.nn.init.xavier_uniform_(m.weight, gain)\n", " else:\n", " print(\"No specific mode selected. Please choose `normal` or `uniform`\")\n", "\n", " negative_slope = 0.1\n", " actv = f'LeakyReLU({negative_slope})'\n", " set_seed(seed=SEED)\n", " net = Net(actv, 3*32*32, [128, 64, 32], 3).to(DEVICE)\n", " net.apply(init_weights)\n", " criterion = nn.CrossEntropyLoss()\n", "\n", " optimizer = optim.SGD(net.parameters(), lr=1e-2)\n", " train_acc, test_acc = train_test_classification(net, criterion, optimizer,\n", " img_train_loader,\n", " img_test_loader,\n", " num_epochs=1,\n", " verbose=True,\n", " device=DEVICE)\n", " test_accs += [test_acc]\n", " train_accs += [train_acc]" ] }, { "cell_type": "markdown", "metadata": { "execution": {} }, "source": [ "Let's now plot the results!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": {} }, "outputs": [], "source": [ "# Find the gain that leads to the highest accuracy\n", "best_gain = gains[np.argmax(train_accs)]\n", "\n", "# Calculate the theoretical gain\n", "theoretical_gain = np.sqrt(2.0 / (1 + negative_slope ** 2))\n", "\n", "plt.figure()\n", "plt.plot(gains, test_accs, label='Test accuracy', marker='.', alpha=0.6)\n", "plt.plot(gains, train_accs, label='Train accuracy', marker='.', alpha=0.6)\n", "plt.scatter(best_gain, max(train_accs),\n", " label=f'best gain={best_gain:.2f}',\n", " c='k', marker ='x', linewidths=2)\n", "plt.scatter(theoretical_gain, max(train_accs),\n", " label=f'theoretical gain={theoretical_gain:.2f}',\n", " c='g', marker ='x', linewidths=2)\n", "plt.ylabel('Accuracy (%)')\n", "plt.xlabel('gain')\n", "plt.legend()\n", "plt.show()" ] } ], "metadata": { "colab": { "collapsed_sections": [], "gpuType": "T4", "include_colab_link": true, "name": "W1D3_Tutorial2", "provenance": [], "toc_visible": true }, "kernel": { "display_name": "Python 3", "language": "python", "name": "python3" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }