Loading
The dataset commonly known as the California Housing dataset is well known in the field of machine learning and data science. It’s often used for educational purposes, especially in regression analysis tasks that predict house values based on various features.
You can download the file and change rows or otherwise edit it as you wish, as long as you keep the format. Then upload it and see the regression results in Google Docs.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
# Load the housing data from a CSV file into a DataFrame
df = pd.read_csv('housing.csv')
# Fill missing values in the 'total_bedrooms' column with the column median.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the df[...] selection: that in-place call acts
# on an intermediate object, which pandas warns about and which leaves df
# unchanged once Copy-on-Write is enabled.
median = df["total_bedrooms"].median()
df["total_bedrooms"] = df["total_bedrooms"].fillna(median)
# Separate the frame into three parts: the regression target, the single
# categorical feature, and the numerical features. The numerical part is
# everything left after removing the target, the categorical column, and the
# longitude/latitude columns.
y = df["median_house_value"]
df_cat = df[["ocean_proximity"]]  # double brackets keep this a DataFrame
df_num = df.drop(
    columns=["ocean_proximity", "longitude", "latitude", "median_house_value"]
)
# Categorical attributes: learn the categories, one-hot encode the column,
# and convert the encoder output to a dense array with .toarray().
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(df_cat)
df_cat_encoded = onehot_encoder.transform(df_cat).toarray()
# Numerical attributes: fit the standard scaler on the numerical columns and
# apply it to those same columns.
standar_scaler = StandardScaler()
standar_scaler.fit(df_num)
df_num_scaled = standar_scaler.transform(df_num)
# Build the feature matrix by placing the scaled numerical columns first and
# the one-hot columns after them (column-wise stacking).
X = np.hstack((df_num_scaled, df_cat_encoded))
# Fit a linear regression model on the feature matrix and predict on that
# same matrix, so the errors reported below are measured on the data the
# model was fitted to.
lin_reg = LinearRegression().fit(X, y)
predictions = lin_reg.predict(X)
# Mean Squared Error (MSE) between targets and predictions, and its square
# root, the Root Mean Squared Error (RMSE).
lin_mse = mean_squared_error(y_true=y, y_pred=predictions)
lin_rmse = np.sqrt(lin_mse)
# Report both error metrics for the linear regression model
print("Mean Squared Error (MSE):", lin_mse)
print("Root Mean Squared Error (RMSE):", lin_rmse)