{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to Data Science\n",
    "## Lab 11: Solution to Part B"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Part B: Forward and backward stepwise selection algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Task (5 points)**: Once you've done Part A of this lab, it should be rather easy to implement either the **forward stepwise selection algorithm** or the **backward stepwise selection algorithm** as a function `forwardStepwiseSelection` or `backwardStepwiseSelection`, resp.\n",
    "\n",
    "Start by implementing a function `forwardStepwiseComputation` or `backwardStepwiseComputation` to perform steps 1 and 2 of the respective algorithm.\n",
    "\n",
    "Use 10-fold cross-validation as a measure for model selection (step 3 of the algorithms).\n",
    "Here, you can use the function `cross_val_score` from `sklearn.model_selection`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "nbgrader": {
     "grade": false,
     "grade_id": "cell-d457193eecf936f7",
     "locked": false,
     "schema_version": 3,
     "solution": true,
     "task": false
    }
   },
   "outputs": [],
   "source": [
    "### BEGIN SOLUTION\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score\n",
    "from sklearn.model_selection import cross_val_score\n",
    "import numpy as np\n",
    "\n",
    "def forwardStepwiseComputation(X, y, scoring_func = r2_score):\n",
    "    \"\"\" Input: X - predictor array of size (n,p)\n",
    "               y - array of size (n,)\n",
    "               scoring_func - function that takes two arguments y_true\n",
    "                  and y_pred, and returns a score\n",
    "               \n",
    "    \"\"\"\n",
    "    \n",
    "    # Get the number of samples and the number of predictors \n",
    "    n, p = X.shape\n",
    "    \n",
    "    # Prepare lists that keep the best scores and models:\n",
    "    #     best_score[i] keeps the best score in a model i predictors\n",
    "    #     best_model[i] keeps the best model using i predictors\n",
    "    \n",
    "    best_score = []\n",
    "    best_model = []\n",
    "\n",
    "    ### First step in forward stepwise selection algorithm\n",
    "    \n",
    "    # The model containing no predictors simply predicts the sample mean\n",
    "    ybar = y.mean()\n",
    "    yhat = ybar * np.ones_like(y)\n",
    "\n",
    "    best_score.append(scoring_func(y, yhat))\n",
    "    best_model.append( () )\n",
    "    \n",
    "    ### Second step in forward stepwise selection algorithm\n",
    "\n",
    "    # Loop over k - number of predictors in our model\n",
    "    for k in range(1, p+1):\n",
    "\n",
    "        best_model.append( () )\n",
    "        best_score.append(0.)\n",
    "        \n",
    "        for l in set(range(p)) - set(best_model[k-1]):\n",
    "            this_model = best_model[k-1] + (l,)\n",
    "            lr = LinearRegression()\n",
    "            lrfit = lr.fit(X[:,this_model],y)\n",
    "            yhat = lrfit.predict(X[:,this_model])\n",
    "\n",
    "            this_score = scoring_func(y,yhat)\n",
    "            \n",
    "            if this_score > best_score[k]:\n",
    "                best_score[k] = this_score\n",
    "                best_model[k] = this_model\n",
    "                \n",
    "    return (best_model, best_score)\n",
    "\n",
    "\n",
    "def forwardStepwiseSelection(X, y, scoring = 'cv'):\n",
    "    \"\"\" Input: X - predictor array of size (n,p)\n",
    "               y - array of size (n,)\n",
    "               scoring - string variable, one of 'aic', 'bic', 'r2', 'cv'\n",
    "               \n",
    "        Output: bps - best parameter selection\n",
    "    \"\"\"\n",
    "    score = []\n",
    "    def scoring_func(y, yhat, d):\n",
    "        if scoring == 'r2':\n",
    "            return adjustedRSquare(y, yhat, d)\n",
    "        elif scoring == 'cv':\n",
    "            return 0        \n",
    "        else:\n",
    "            shat = sigmaHat(X,y)      \n",
    "            if scoring == 'aic':\n",
    "                return AIC(y, yhat, d, shat)\n",
    "            elif scoring == 'bic':\n",
    "                return BIC(y, yhat, d, shat)\n",
    "            else:\n",
    "                raise NameError('scoring not known')\n",
    "\n",
    "    best_model, best_score = forwardStepwiseComputation(X,y)\n",
    "    print(best_model)\n",
    "    print(best_score)\n",
    "    for d, l in enumerate(best_model):\n",
    "        if scoring == 'cv':\n",
    "            lr = LinearRegression()\n",
    "            if len(l) == 0:\n",
    "                this_score = cross_val_score(lr, np.ones((len(y),1)), y, cv = 10,scoring='r2').mean()\n",
    "            else:\n",
    "                this_score = cross_val_score(lr, X[:,l], y, cv = 10,scoring='r2').mean()\n",
    "            #print(this_score)\n",
    "            score.append( this_score )\n",
    "        else:\n",
    "                \n",
    "            if len(l) == 0:\n",
    "                yhat = y.mean() * np.ones_like(y)\n",
    "            else:\n",
    "                lr = LinearRegression()\n",
    "                lrfit = lr.fit(X[:,l],y)\n",
    "                yhat = lrfit.predict(X[:,l])\n",
    "\n",
    "            score.append( scoring_func(y, yhat, d) )\n",
    "        \n",
    "    if scoring == 'r2' or scoring == 'cv':\n",
    "        best_idx = np.argmax(score)\n",
    "    else:\n",
    "        best_idx = np.argmin(score)\n",
    "        \n",
    "    \n",
    "    return score[best_idx], best_model[best_idx]\n",
    "\n",
    "def backwardStepwiseComputation(X, y, scoring_func = r2_score):\n",
    "    \"\"\" Input: X - predictor array of size (n,p)\n",
    "               y - array of size (n,)\n",
    "               scoring_func - function that takes two arguments y_true\n",
    "                  and y_pred, and returns a score\n",
    "               \n",
    "    \"\"\"\n",
    "    \n",
    "    # Get the number of samples and the number of predictors \n",
    "    n, p = X.shape\n",
    "    \n",
    "    # Prepare lists that keep the best scores and models:\n",
    "    #     best_score[i] keeps the best score in a model i predictors\n",
    "    #     best_model[i] keeps the best model using i predictors\n",
    "    \n",
    "    best_score = []\n",
    "    best_model = []\n",
    "\n",
    "    ### First step in backward stepwise selection algorithm\n",
    "    \n",
    "    # This time we start with the full model\n",
    "    lr = LinearRegression()\n",
    "    lrfit = lr.fit(X,y)\n",
    "    yhat = lrfit.predict(X)\n",
    "    \n",
    "    best_score.append(scoring_func(y,yhat))\n",
    "    best_model.append( tuple(range(p)) )\n",
    "    \n",
    "    ### Second step in best subset selection algorithm\n",
    "\n",
    "    # Loop over k - number of predictors in our model\n",
    "    for k in range(1, p+1):\n",
    "        best_model.append( () )\n",
    "        best_score.append(0.)\n",
    "        \n",
    "        for l in best_model[k-1]:\n",
    "            this_model = tuple(set(best_model[k-1]) - set([l]))\n",
    "            lr = LinearRegression()\n",
    "            if len(this_model) == 0:\n",
    "                yhat = y.mean() * np.ones_like(y)\n",
    "            else:\n",
    "                lrfit = lr.fit(X[:,this_model],y)\n",
    "                yhat = lrfit.predict(X[:,this_model])\n",
    "\n",
    "            this_score = scoring_func(y,yhat)\n",
    "            \n",
    "            if this_score > best_score[k]:\n",
    "                best_score[k] = this_score\n",
    "                best_model[k] = this_model\n",
    "                \n",
    "    return (best_model, best_score)\n",
    "\n",
    "def backwardStepwiseSelection(X, y, scoring = 'cv'):\n",
    "    \"\"\" Input: X - predictor array of size (n,p)\n",
    "               y - array of size (n,)\n",
    "               scoring - string variable, one of 'aic', 'bic', 'r2', 'cv'\n",
    "               \n",
    "        Output: bps - best parameter selection\n",
    "    \"\"\"\n",
    "    score = []\n",
    "    def scoring_func(y, yhat, d):\n",
    "        if scoring == 'r2':\n",
    "            return adjustedRSquare(y, yhat, d)\n",
    "        elif scoring == 'cv':\n",
    "            return 0        \n",
    "        else:\n",
    "            shat = sigmaHat(X,y)      \n",
    "            if scoring == 'aic':\n",
    "                return AIC(y, yhat, d, shat)\n",
    "            elif scoring == 'bic':\n",
    "                return BIC(y, yhat, d, shat)\n",
    "            else:\n",
    "                raise NameError('scoring not known')\n",
    "\n",
    "    best_model, best_score = backwardStepwiseComputation(X,y)\n",
    "    \n",
    "    for d, l in enumerate(best_model):\n",
    "        if scoring == 'cv':\n",
    "            lr = LinearRegression()\n",
    "            if len(l) == 0:\n",
    "                this_score = cross_val_score(lr, np.ones((len(y),1)), y, cv = 10,scoring='r2').mean()\n",
    "            else:\n",
    "                this_score = cross_val_score(lr, X[:,l], y, cv = 10,scoring='r2').mean()\n",
    "            #print(this_score)\n",
    "            score.append( this_score )\n",
    "        else:\n",
    "                \n",
    "            if len(l) == 0:\n",
    "                yhat = y.mean() * np.ones_like(y)\n",
    "            else:\n",
    "                lr = LinearRegression()\n",
    "                lrfit = lr.fit(X[:,l],y)\n",
    "                yhat = lrfit.predict(X[:,l])\n",
    "\n",
    "            score.append( scoring_func(y, yhat, d) )\n",
    "        \n",
    "    if scoring == 'r2' or scoring == 'cv':\n",
    "        best_idx = np.argmax(score)\n",
    "    else:\n",
    "        best_idx = np.argmin(score)\n",
    "    \n",
    "    return score[best_idx], best_model[best_idx]\n",
    "### END SOLUTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "nbgrader": {
     "grade": true,
     "grade_id": "cell-6dea1887b538e822",
     "locked": true,
     "points": 5,
     "schema_version": 3,
     "solution": false,
     "task": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(), (2,), (2, 8), (2, 8, 3), (2, 8, 3, 4), (2, 8, 3, 4, 1), (2, 8, 3, 4, 1, 5), (2, 8, 3, 4, 1, 5, 7), (2, 8, 3, 4, 1, 5, 7, 9), (2, 8, 3, 4, 1, 5, 7, 9, 6), (2, 8, 3, 4, 1, 5, 7, 9, 6, 0)]\n",
      "[0.0, 0.3439237602253803, 0.4594852440167805, 0.48008281990946045, 0.49201619828699916, 0.49986101067387156, 0.5148848325387045, 0.5162912373528603, 0.5174713510884762, 0.5177180065591446, 0.5177494254132934]\n",
      "[(), (3,), (3, 4), (3, 4, 6), (3, 4, 6, 9), (3, 4, 6, 9, 5), (3, 4, 6, 9, 5, 8), (3, 4, 6, 9, 5, 8, 2), (3, 4, 6, 9, 5, 8, 2, 0), (3, 4, 6, 9, 5, 8, 2, 0, 7), (3, 4, 6, 9, 5, 8, 2, 0, 7, 1)]\n",
      "[0.0, 0.12090214108658293, 0.13761542573390306, 0.1552917547097522, 0.16818670108557, 0.17633610271942057, 0.18531661113936582, 0.18647395106017728, 0.18748905563843532, 0.18757798393855152, 0.1875879884699032]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import load_diabetes\n",
    "data = load_diabetes()\n",
    "\n",
    "#print(data.DESCR)\n",
    "X = data.data\n",
    "y = data.target\n",
    "\n",
    "if 'forwardStepwiseSelection' in locals():\n",
    "    fSS_score, fSS_model = forwardStepwiseSelection(X,y)\n",
    "elif 'backwardStepwiseSelection' in locals():\n",
    "    fBB_score, fBB_model = backwardStepwiseSelection(X,y)\n",
    "else: assert False, 'Implement the backwardStepwiseSelection or the forwardStepwiseSelection'\n",
    "\n",
    "assert 'fSS_score' in locals() and abs(fSS_score - 0.4723839929236253) < 1e-8 or 'fBB_score' in locals() and abs(fBB_score - 0.4723839929236256) < 1e-8\n",
    "\n",
    "if 'forwardStepwiseSelection' in locals():\n",
    "    np.random.seed(0)\n",
    "    fSS_score, fSS_model = forwardStepwiseSelection(np.random.randn(100,10),np.random.randn(100,))\n",
    "    assert abs(fSS_score + 0.11591245146628207) < 1e-6\n",
    "    assert fSS_model[1] == 4\n",
    "elif 'backwardStepwiseSelection' in locals():\n",
    "    np.random.seed(1)\n",
    "    bSS_score, bSS_model = backwardStepwiseSelection(np.random.randn(100,10),np.random.randn(100,))\n",
    "    assert abs(bSS_score + 0.17226717542216358) < 1e-6\n",
    "    assert bSS_model[1] == 3\n",
    "    \n",
    "else:\n",
    "    assert False, 'Implement the backwardStepwiseSelection or the forwardStepwiseSelection'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Create Assignment",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}