{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to Data Science\n", "## Lab 11: Solution to Part B" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Part B: Forward and backward stepwise selection algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Task (5 points)**: Once you've done Part A of this lab, it should be rather easy to implement either the **forward stepwise selection algorithm** or the **backward stepwise selection algorithm** as a function `forwardStepwiseSelection` or `backwardStepwiseSelection`, resp.\n", "\n", "Start by implementing a function `forwardStepwiseComputation` or `backwardStepwiseComputation` to perform steps 1 and 2 of the respective algorithm.\n", "\n", "Use 10-fold cross-validation as a measure for model selection (step 3 of the algorithms).\n", "Here, you can use the function `cross_val_score` from `sklearn.model_selection`." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "nbgrader": { "grade": false, "grade_id": "cell-d457193eecf936f7", "locked": false, "schema_version": 3, "solution": true, "task": false } }, "outputs": [], "source": [ "### BEGIN SOLUTION\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import r2_score\n", "from sklearn.model_selection import cross_val_score\n", "import numpy as np\n", "\n", "def forwardStepwiseComputation(X, y, scoring_func = r2_score):\n", " \"\"\" Input: X - predictor array of size (n,p)\n", " y - array of size (n,)\n", " scoring_func - function that takes two arguments y_true\n", " and y_pred, and returns a score\n", " \n", " \"\"\"\n", " \n", " # Get the number of samples and the number of predictors \n", " n, p = X.shape\n", " \n", " # Prepare lists that keep the best scores and models:\n", " # best_score[i] keeps the best score in a model i predictors\n", " # best_model[i] keeps the best model using i predictors\n", " \n", " best_score = []\n", " best_model = []\n", "\n", " ### First step in forward stepwise selection algorithm\n", " \n", " # The model containing no predictors simply predicts the sample mean\n", " ybar = y.mean()\n", " yhat = ybar * np.ones_like(y)\n", "\n", " best_score.append(scoring_func(y, yhat))\n", " best_model.append( () )\n", " \n", " ### Second step in forward stepwise selection algorithm\n", "\n", " # Loop over k - number of predictors in our model\n", " for k in range(1, p+1):\n", "\n", " best_model.append( () )\n", " best_score.append(0.)\n", " \n", " for l in set(range(p)) - set(best_model[k-1]):\n", " this_model = best_model[k-1] + (l,)\n", " lr = LinearRegression()\n", " lrfit = lr.fit(X[:,this_model],y)\n", " yhat = lrfit.predict(X[:,this_model])\n", "\n", " this_score = scoring_func(y,yhat)\n", " \n", " if this_score > best_score[k]:\n", " best_score[k] = this_score\n", " best_model[k] = this_model\n", " \n", " return (best_model, best_score)\n", "\n", "\n", "def forwardStepwiseSelection(X, y, scoring = 'cv'):\n", " \"\"\" Input: X - predictor array of size (n,p)\n", " y - array of size (n,)\n", " scoring - string variable, one of 'aic', 'bic', 'r2', 'cv'\n", " \n", " Output: bps - best parameter selection\n", " \"\"\"\n", " score = []\n", " def scoring_func(y, yhat, d):\n", " if scoring == 'r2':\n", " return adjustedRSquare(y, yhat, d)\n", " elif scoring == 'cv':\n", " return 0 \n", " else:\n", " shat = sigmaHat(X,y) \n", " if scoring == 'aic':\n", " return AIC(y, yhat, d, shat)\n", " elif scoring == 'bic':\n", " return BIC(y, yhat, d, shat)\n", " else:\n", " raise NameError('scoring not known')\n", "\n", " best_model, best_score = forwardStepwiseComputation(X,y)\n", " print(best_model)\n", " print(best_score)\n", " for d, l in enumerate(best_model):\n", " if scoring == 'cv':\n", " lr = LinearRegression()\n", " if len(l) == 0:\n", " this_score = cross_val_score(lr, np.ones((len(y),1)), y, cv = 10,scoring='r2').mean()\n", " else:\n", " this_score = cross_val_score(lr, X[:,l], y, cv = 10,scoring='r2').mean()\n", " #print(this_score)\n", " score.append( this_score )\n", " else:\n", " \n", " if len(l) == 0:\n", " yhat = y.mean() * np.ones_like(y)\n", " else:\n", " lr = LinearRegression()\n", " lrfit = lr.fit(X[:,l],y)\n", " yhat = lrfit.predict(X[:,l])\n", "\n", " score.append( scoring_func(y, yhat, d) )\n", " \n", " if scoring == 'r2' or scoring == 'cv':\n", " best_idx = np.argmax(score)\n", " else:\n", " best_idx = np.argmin(score)\n", " \n", " \n", " return score[best_idx], best_model[best_idx]\n", "\n", "def backwardStepwiseComputation(X, y, scoring_func = r2_score):\n", " \"\"\" Input: X - predictor array of size (n,p)\n", " y - array of size (n,)\n", " scoring_func - function that takes two arguments y_true\n", " and y_pred, and returns a score\n", " \n", " \"\"\"\n", " \n", " # Get the number of samples and the number of predictors \n", " n, p = X.shape\n", " \n", " # Prepare lists that keep the best scores and models:\n", " # best_score[i] keeps the best score in a model i predictors\n", " # best_model[i] keeps the best model using i predictors\n", " \n", " best_score = []\n", " best_model = []\n", "\n", " ### First step in backward stepwise selection algorithm\n", " \n", " # This time we start with the full model\n", " lr = LinearRegression()\n", " lrfit = lr.fit(X,y)\n", " yhat = lrfit.predict(X)\n", " \n", " best_score.append(scoring_func(y,yhat))\n", " best_model.append( tuple(range(p)) )\n", " \n", " ### Second step in best subset selection algorithm\n", "\n", " # Loop over k - number of predictors in our model\n", " for k in range(1, p+1):\n", " best_model.append( () )\n", " best_score.append(0.)\n", " \n", " for l in best_model[k-1]:\n", " this_model = tuple(set(best_model[k-1]) - set([l]))\n", " lr = LinearRegression()\n", " if len(this_model) == 0:\n", " yhat = y.mean() * np.ones_like(y)\n", " else:\n", " lrfit = lr.fit(X[:,this_model],y)\n", " yhat = lrfit.predict(X[:,this_model])\n", "\n", " this_score = scoring_func(y,yhat)\n", " \n", " if this_score > best_score[k]:\n", " best_score[k] = this_score\n", " best_model[k] = this_model\n", " \n", " return (best_model, best_score)\n", "\n", "def backwardStepwiseSelection(X, y, scoring = 'cv'):\n", " \"\"\" Input: X - predictor array of size (n,p)\n", " y - array of size (n,)\n", " scoring - string variable, one of 'aic', 'bic', 'r2', 'cv'\n", " \n", " Output: bps - best parameter selection\n", " \"\"\"\n", " score = []\n", " def scoring_func(y, yhat, d):\n", " if scoring == 'r2':\n", " return adjustedRSquare(y, yhat, d)\n", " elif scoring == 'cv':\n", " return 0 \n", " else:\n", " shat = sigmaHat(X,y) \n", " if scoring == 'aic':\n", " return AIC(y, yhat, d, shat)\n", " elif scoring == 'bic':\n", " return BIC(y, yhat, d, shat)\n", " else:\n", " raise NameError('scoring not known')\n", "\n", " best_model, best_score = backwardStepwiseComputation(X,y)\n", " \n", " for d, l in enumerate(best_model):\n", " if scoring == 'cv':\n", " lr = LinearRegression()\n", " if len(l) == 0:\n", " this_score = cross_val_score(lr, np.ones((len(y),1)), y, cv = 10,scoring='r2').mean()\n", " else:\n", " this_score = cross_val_score(lr, X[:,l], y, cv = 10,scoring='r2').mean()\n", " #print(this_score)\n", " score.append( this_score )\n", " else:\n", " \n", " if len(l) == 0:\n", " yhat = y.mean() * np.ones_like(y)\n", " else:\n", " lr = LinearRegression()\n", " lrfit = lr.fit(X[:,l],y)\n", " yhat = lrfit.predict(X[:,l])\n", "\n", " score.append( scoring_func(y, yhat, d) )\n", " \n", " if scoring == 'r2' or scoring == 'cv':\n", " best_idx = np.argmax(score)\n", " else:\n", " best_idx = np.argmin(score)\n", " \n", " return score[best_idx], best_model[best_idx]\n", "### END SOLUTION" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "nbgrader": { "grade": true, "grade_id": "cell-6dea1887b538e822", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(), (2,), (2, 8), (2, 8, 3), (2, 8, 3, 4), (2, 8, 3, 4, 1), (2, 8, 3, 4, 1, 5), (2, 8, 3, 4, 1, 5, 7), (2, 8, 3, 4, 1, 5, 7, 9), (2, 8, 3, 4, 1, 5, 7, 9, 6), (2, 8, 3, 4, 1, 5, 7, 9, 6, 0)]\n", "[0.0, 0.3439237602253803, 0.4594852440167805, 0.48008281990946045, 0.49201619828699916, 0.49986101067387156, 0.5148848325387045, 0.5162912373528603, 0.5174713510884762, 0.5177180065591446, 0.5177494254132934]\n", "[(), (3,), (3, 4), (3, 4, 6), (3, 4, 6, 9), (3, 4, 6, 9, 5), (3, 4, 6, 9, 5, 8), (3, 4, 6, 9, 5, 8, 2), (3, 4, 6, 9, 5, 8, 2, 0), (3, 4, 6, 9, 5, 8, 2, 0, 7), (3, 4, 6, 9, 5, 8, 2, 0, 7, 1)]\n", "[0.0, 0.12090214108658293, 0.13761542573390306, 0.1552917547097522, 0.16818670108557, 0.17633610271942057, 0.18531661113936582, 0.18647395106017728, 0.18748905563843532, 0.18757798393855152, 0.1875879884699032]\n" ] } ], "source": [ "from sklearn.datasets import load_diabetes\n", "data = load_diabetes()\n", "\n", "#print(data.DESCR)\n", "X = data.data\n", "y = data.target\n", "\n", "if 'forwardStepwiseSelection' in locals():\n", " fSS_score, fSS_model = forwardStepwiseSelection(X,y)\n", "elif 'backwardStepwiseSelection' in locals():\n", " fBB_score, fBB_model = backwardStepwiseSelection(X,y)\n", "else: assert False, 'Implement the backwardStepwiseSelection or the forwardStepwiseSelection'\n", "\n", "assert 'fSS_score' in locals() and abs(fSS_score - 0.4723839929236253) < 1e-8 or 'fBB_score' in locals() and abs(fBB_score - 0.4723839929236256) < 1e-8\n", "\n", "if 'forwardStepwiseSelection' in locals():\n", " np.random.seed(0)\n", " fSS_score, fSS_model = forwardStepwiseSelection(np.random.randn(100,10),np.random.randn(100,))\n", " assert abs(fSS_score + 0.11591245146628207) < 1e-6\n", " assert fSS_model[1] == 4\n", "elif 'backwardStepwiseSelection' in locals():\n", " np.random.seed(1)\n", " bSS_score, bSS_model = backwardStepwiseSelection(np.random.randn(100,10),np.random.randn(100,))\n", " assert abs(bSS_score + 0.17226717542216358) < 1e-6\n", " assert bSS_model[1] == 3\n", " \n", "else:\n", " assert False, 'Implement the backwardStepwiseSelection or the forwardStepwiseSelection'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Create Assignment", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }