Introduction

import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
def transform_features(df):
    return df

def select_features(df):
    return df[["Gr Liv Area", "SalePrice"]]

def train_and_test(df):
    ## Hold-out split: the first 1460 rows (roughly half of the 2,930) train, the rest test
    train = df[:1460]
    test = df[1460:]
    
    ## You can use `pd.DataFrame.select_dtypes()` to specify column types
    ## and return only those columns as a data frame.
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])
    
    ## You can use `pd.Series.drop()` to drop a value.
    features = numeric_train.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test["SalePrice"], predictions)
    rmse = np.sqrt(mse)
    
    return rmse

transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse
57088.251612639091
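As an aside, if you're running scikit-learn 1.4 or newer, the MSE-then-sqrt two-step can be collapsed into one call; a minimal sketch with toy arrays (hypothetical values):

import numpy as np
from sklearn.metrics import root_mean_squared_error

y_true = np.array([215000, 105000, 172000])
y_pred = np.array([210000, 119000, 165000])

## One call replaces mean_squared_error() followed by np.sqrt()
root_mean_squared_error(y_true, y_pred)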

Feature Engineering

Handle missing values:
1. All columns: drop any with 5% or more missing values for now.
2. Text columns: drop any with 1 or more missing values for now.
3. Numerical columns: fill missing values with the most common value in that column.
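Before applying these rules, it helps to eyeball the overall missingness. A minimal sketch (shares rather than counts, so the 5% rule is easy to check):

## Share of missing values per column, largest first
missing_pct = df.isnull().mean().sort_values(ascending=False)
missing_pct[missing_pct > 0]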

1: All columns: Drop any with 5% or more missing values for now.

## Series object: column name -> number of missing values
num_missing = df.isnull().sum()

## Filter Series to columns containing more than 5% missing values
drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()

## Drop those columns from the data frame. Note the use of the .index accessor
df = df.drop(drop_missing_cols.index, axis=1)
## Series object: column name -> number of missing values (text columns only)
text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)

## Filter Series to columns containing *any* missing values
drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]

df = df.drop(drop_missing_cols_2.index, axis=1)
## Compute column-wise missing value counts
num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()
fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
fixable_numeric_cols
BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Garage Cars        1
Garage Area        1
Bsmt Full Bath     2
Bsmt Half Bath     2
Mas Vnr Area      23
dtype: int64
## Compute the most common value for each column in `fixable_numeric_cols`.
replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
replacement_values_dict
{'Bsmt Full Bath': 0.0,
 'Bsmt Half Bath': 0.0,
 'Bsmt Unf SF': 0.0,
 'BsmtFin SF 1': 0.0,
 'BsmtFin SF 2': 0.0,
 'Garage Area': 0.0,
 'Garage Cars': 2.0,
 'Mas Vnr Area': 0.0,
 'Total Bsmt SF': 0.0}
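The mode()/to_dict() chain is dense, so here's a toy sketch (made-up frame) of what each step produces:

toy = pd.DataFrame({"a": [0, 0, 1], "b": [2, 2, 3]})
## mode() returns a DataFrame (ties add extra rows); row 0 holds the most
## common value per column, and orient='records' turns that row into a
## {column: value} dict that fillna() accepts directly
toy.mode().to_dict(orient='records')[0]   # {'a': 0, 'b': 2}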
## Use `pd.DataFrame.fillna()` to replace missing values.
df = df.fillna(replacement_values_dict)
## Verify that every column has 0 missing values
df.isnull().sum().value_counts()
0    64
dtype: int64
years_sold = df['Yr Sold'] - df['Year Built']
years_sold[years_sold < 0]
2180   -1
dtype: int64
years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
years_since_remod[years_since_remod < 0]
1702   -1
2180   -2
2181   -1
dtype: int64
## Create new columns
df['Years Before Sale'] = years_sold
df['Years Since Remod'] = years_since_remod

## Drop rows with a negative value for either of these new features
df = df.drop([1702, 2180, 2181], axis=0)

## No longer need original year columns
df = df.drop(["Year Built", "Year Remod/Add"], axis=1)

Drop columns that:
a. aren't useful for ML
b. leak data about the final sale

## Drop columns that aren't useful for ML
df = df.drop(["PID", "Order"], axis=1)

## Drop columns that leak info about the final sale
df = df.drop(["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"], axis=1)

Let's update transform_features()

def transform_features(df):
    num_missing = df.isnull().sum()
    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()
    df = df.drop(drop_missing_cols.index, axis=1)
    
    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]
    df = df.drop(drop_missing_cols_2.index, axis=1)
    
    num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()
    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
    replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    df = df.fillna(replacement_values_dict)
    
    years_sold = df['Yr Sold'] - df['Year Built']
    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
    df['Years Before Sale'] = years_sold
    df['Years Since Remod'] = years_since_remod
    df = df.drop([1702, 2180, 2181], axis=0)

    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)
    return df

def select_features(df):
    return df[["Gr Liv Area", "SalePrice"]]

def train_and_test(df):  
    train = df[:1460]
    test = df[1460:]
    
    ## You can use `pd.DataFrame.select_dtypes()` to specify column types
    ## and return only those columns as a data frame.
    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])
    
    ## You can use `pd.Series.drop()` to drop a value.
    features = numeric_train.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test["SalePrice"], predictions)
    rmse = np.sqrt(mse)
    
    return rmse

df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse
55275.367312413066

Feature Selection

numerical_df = transform_df.select_dtypes(include=['int', 'float'])
numerical_df
[Output: preview of numerical_df — 2927 rows × 34 columns: MS SubClass, Lot Area, Overall Qual, Overall Cond, Mas Vnr Area, BsmtFin SF 1, BsmtFin SF 2, Bsmt Unf SF, Total Bsmt SF, 1st Flr SF, 2nd Flr SF, Low Qual Fin SF, Gr Liv Area, Bsmt Full Bath, Bsmt Half Bath, Full Bath, Half Bath, Bedroom AbvGr, Kitchen AbvGr, TotRms AbvGrd, Fireplaces, Garage Cars, Garage Area, Wood Deck SF, Open Porch SF, Enclosed Porch, 3Ssn Porch, Screen Porch, Pool Area, Misc Val, Yr Sold, SalePrice, Years Before Sale, Years Since Remod]

abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
abs_corr_coeffs
BsmtFin SF 2         0.006127
Misc Val             0.019273
Yr Sold              0.030358
3Ssn Porch           0.032268
Bsmt Half Bath       0.035875
Low Qual Fin SF      0.037629
Pool Area            0.068438
MS SubClass          0.085128
Overall Cond         0.101540
Screen Porch         0.112280
Kitchen AbvGr        0.119760
Enclosed Porch       0.128685
Bedroom AbvGr        0.143916
Bsmt Unf SF          0.182751
Lot Area             0.267520
2nd Flr SF           0.269601
Bsmt Full Bath       0.276258
Half Bath            0.284871
Open Porch SF        0.316262
Wood Deck SF         0.328183
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: SalePrice, dtype: float64
## Let's only keep columns with a correlation coefficient larger than 0.4 (an arbitrary cutoff, worth experimenting with later!)
abs_corr_coeffs[abs_corr_coeffs > 0.4]
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: SalePrice, dtype: float64
## Drop columns with less than 0.4 correlation with SalePrice
transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)
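Since 0.4 is arbitrary, a quick sweep shows how many features each candidate cutoff would keep; a minimal sketch reusing the abs_corr_coeffs Series from above:

## Feature count surviving each cutoff (minus 1 to exclude SalePrice itself)
for cutoff in [0.2, 0.3, 0.4, 0.5, 0.6]:
    print(cutoff, len(abs_corr_coeffs[abs_corr_coeffs > cutoff]) - 1)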

Which categorical columns should we keep?

## Create a list of column names from documentation that are *meant* to be categorical
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?
If a categorical column has hundreds of unique values (or categories), should we keep it? Dummy coding such a column would add hundreds of columns back to the data frame.
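A minimal sketch of both points, using the raw df read earlier (transform_df has already had low-correlation numeric columns such as MS SubClass dropped):

## MS SubClass is read in as an integer, but it's really a dwelling-type
## code, so its magnitude carries no semantic meaning
df['MS SubClass'].value_counts().head()

## Dummy coding a high-cardinality text column widens the frame by one
## 0/1 column per unique category
pd.get_dummies(df['Neighborhood']).shape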

## Which categorical columns have we still carried with us? We'll test these
transform_cat_cols = []
for col in nominal_features:
    if col in transform_df.columns:
        transform_cat_cols.append(col)

## How many unique values in each categorical column?
uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
## Arbitrary cutoff of 10 unique values (worth experimenting)
drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index
transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)
## Select just the remaining text columns and convert to categorical
text_cols = transform_df.select_dtypes(include=['object'])
for col in text_cols:
    transform_df[col] = transform_df[col].astype('category')
    
## Create dummy columns and add back to the dataframe!
transform_df = pd.concat([
    transform_df, 
    pd.get_dummies(transform_df.select_dtypes(include=['category']))
], axis=1)
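One judgment call: the concat leaves the original category columns in the frame alongside their dummies. They're harmless here because train_and_test() only selects numeric columns, but if you prefer a tidier frame, a one-line sketch:

## Optional: drop the now-redundant category columns
transform_df = transform_df.drop(text_cols.columns, axis=1)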

Update select_features() and train_and_test()

def transform_features(df):
    num_missing = df.isnull().sum()
    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()
    df = df.drop(drop_missing_cols.index, axis=1)
    
    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]
    df = df.drop(drop_missing_cols_2.index, axis=1)
    
    num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()
    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
    replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    df = df.fillna(replacement_values_dict)
    
    years_sold = df['Yr Sold'] - df['Year Built']
    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
    df['Years Before Sale'] = years_sold
    df['Years Since Remod'] = years_since_remod
    df = df.drop([1702, 2180, 2181], axis=0)

    df = df.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)
    return df

def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    numerical_df = df.select_dtypes(include=['int', 'float'])
    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)
    
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]
    
    transform_cat_cols = []
    for col in nominal_features:
        if col in df.columns:
            transform_cat_cols.append(col)

    uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
    df = df.drop(drop_nonuniq_cols, axis=1)
    
    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col] = df[col].astype('category')
    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1)
    
    return df

def train_and_test(df, k=0):
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    features = numeric_df.columns.drop("SalePrice")
    lr = linear_model.LinearRegression()
    
    if k == 0:
        train = df[:1460]
        test = df[1460:]

        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test["SalePrice"], predictions)
        rmse = np.sqrt(mse)

        return rmse
    
    if k == 1:
        ## Randomize *all* rows (frac=1) from `df` and split the shuffled frame
        shuffled_df = df.sample(frac=1)
        train = shuffled_df[:1460]
        test = shuffled_df[1460:]
        
        lr.fit(train[features], train["SalePrice"])
        predictions_one = lr.predict(test[features])        
        
        mse_one = mean_squared_error(test["SalePrice"], predictions_one)
        rmse_one = np.sqrt(mse_one)
        
        lr.fit(test[features], test["SalePrice"])
        predictions_two = lr.predict(train[features])        
       
        mse_two = mean_squared_error(train["SalePrice"], predictions_two)
        rmse_two = np.sqrt(mse_two)
        
        avg_rmse = np.mean([rmse_one, rmse_two])
        print(rmse_one)
        print(rmse_two)
        return avg_rmse
    else:
        kf = KFold(n_splits=k, shuffle=True)
        rmse_values = []
        for train_index, test_index in kf.split(df):
            train = df.iloc[train_index]
            test = df.iloc[test_index]
            lr.fit(train[features], train["SalePrice"])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test["SalePrice"], predictions)
            rmse = np.sqrt(mse)
            rmse_values.append(rmse)
        print(rmse_values)
        avg_rmse = np.mean(rmse_values)
        return avg_rmse

df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df, k=4)

rmse
[25761.875549560471, 36527.812968130842, 24956.485193881424, 28486.738135675929]
28933.227961812168
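The per-fold RMSE values will vary from run to run because KFold shuffles without a fixed seed. A usage sketch for comparing a few fold counts (k=4 above is as arbitrary as the other cutoffs):

## Compare average RMSE across different numbers of folds
for k in [2, 4, 8, 16]:
    print(k, train_and_test(filtered_df, k=k))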