You are on page 1 of 2

36:

### Question: create a full pipeline made of a numerical pipeline that
### includes an imputer, an attribute adder and a scaler,
### such that the extra columns are "rooms_per_household",
### "population_per_household" and "bedrooms_per_room",
### plus a one-hot encoder as the categorical pipeline,
### then cast the resulting values from the full pipeline
### into a DataFrame table and show that table

from sklearn.pipeline import Pipeline


from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

# Numeric branch: impute missing values with the column median, run
# add_extra_features over the result, then standardise every column.
# NOTE(review): add_extra_features is defined elsewhere; it is presumed to
# append exactly the three ratio columns listed in extra_columns below, in
# that order -- confirm against its definition.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
])

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline, the categorical column is one-hot
# encoded; ColumnTransformer concatenates the outputs in the listed order.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# Build the column labels in the same order as the transformer output:
# the numeric inputs, the added features, then the one-hot categories.
extra_columns = [
    "rooms_per_household",
    "population_per_household",
    "bedrooms_per_room",
]
# Read the categories from the encoder that was actually fitted inside
# full_pipeline, so the labels always match the data that was produced.
fitted_cat_encoder = full_pipeline.named_transformers_["cat"]
columns = num_attribs + extra_columns + list(fitted_cat_encoder.categories_[0])

# NOTE(review): assumes fit_transform returned a dense array; if it returns
# a sparse matrix, convert it before building the DataFrame.
housing_pipeline = pd.DataFrame(
    housing_prepared,
    columns=columns,
    index=housing.index,
)
# Bare expression: shows the table when run as a notebook cell.
housing_pipeline

37:

### Question: Create a linear regression model for housing_prepared as X


### and housing_labels as Y, then obtain predictions for the
### housing_prepared data
### then get rmse (root mean squared error) and mae (mean
### absolute error) and print them

from sklearn.linear_model import LinearRegression


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Fit a linear regression on the prepared features and predict on the very
# same rows, so the scores below are training-set errors.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
housing_predictions = lin_reg.predict(housing_prepared)

# Compute both error metrics up front, then report them together.
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(y_true=housing_labels, y_pred=housing_predictions)

print("RMSE:", lin_rmse)
print("MAE:", lin_mae)

7:

### Question: Create a function called split_train_test to split the


### data using a given test_ratio then call that function
### with data as the housing table and test_ratio=0.2
### then print the length of the training and testing sets

import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split *data* into a training part and a test part.

    Args:
        data: Table supporting ``len()`` and positional ``.iloc`` indexing
            (for example a pandas DataFrame).
        test_ratio: Fraction of the rows, between 0 and 1, to place in the
            test part. The resulting row count is truncated by ``int()``.
        random_state: Optional seed. When given, the shuffle is reproducible
            across calls; when ``None`` the global NumPy random state is
            used, exactly as before this parameter existed.

    Returns:
        A ``(train, test)`` tuple holding the selected rows of *data*.

    Raises:
        ValueError: If *test_ratio* lies outside ``[0, 1]``.
    """
    # A negative or >1 ratio would otherwise slice silently and return a
    # split that does not match the request.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be within [0, 1], got {!r}".format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    # The first test_set_size shuffled positions form the test part; the
    # remainder is the training part.
    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Hold out 20% of the housing rows as the test part and report both sizes.
train_set, test_set = split_train_test(data=housing, test_ratio=0.2)
print(len(train_set), "train +", len(test_set), "test")

8:

### Cell 8 ### Included ### Included ###


### Question: split the housing table using test_ratio=0.2
### using function from sklearn package
### then print the length of the training and testing sets

from sklearn.model_selection import train_test_split

# Same 80/20 split, done with scikit-learn; random_state is fixed at 42.
train_set, test_set = train_test_split(
    housing,
    random_state=42,
    test_size=0.2,
)
print(len(train_set), "train +", len(test_set), "test")

You might also like