%reload_ext autoreload
%autoreload 2
%matplotlib inline
Predict what properties would have sold for, if sold in 2019
import os
import pandas as pd
from ast import literal_eval
from datetime import datetime
import matplotlib.ticker as plticker
# Load the prepared transaction table; the relative path is resolved against
# the current working directory.
filepath = os.path.realpath('../data/shared/prepared.csv')
df = pd.read_csv(filepath)

# Keep the transaction date as a plain number so it can be plotted on a
# numeric axis and later used as a model feature. The tick formatting below
# treats these values as nanoseconds since the Unix epoch.
df['transaction_at'] = pd.to_numeric(pd.to_datetime(df['transaction_at']))

# Price paid against transaction date; the y-axis is capped at 300k.
ax = df.plot.scatter('transaction_at', 'price_paid', s=0.01, figsize=(25,15))
ax.set_ylim(0, 300000)

# Label ticks as yy/mm with a formatter rather than a fixed list of strings:
# the labels stay correct if the tick positions change, and the conversion no
# longer depends on the local timezone of the machine running the notebook.
ax.xaxis.set_major_formatter(
    plticker.FuncFormatter(lambda ts, _pos: pd.Timestamp(int(ts)).strftime('%y/%m')))
ax
Major cities like London / Bristol / Manchester, and some coastlines, are visible.
# Map-like view of every transaction: longitude on x, latitude on y.
df.plot.scatter(x='lon', y='lat', s=0.02, figsize=(10, 10))
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
Training a random forest to predict price paid based upon location & transaction date.
# Features: latitude, longitude and the numeric transaction timestamp.
# Target: the price actually paid.
feature_columns = ['lat', 'lon', 'transaction_at']
X = np.array(df[feature_columns])
y = np.array(df['price_paid'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=4,
)

# Requiring several samples per leaf stops trees from fitting individual
# sales, which limits overfitting.
min_samples_leaf = 10
regr_rf = RandomForestRegressor(
    n_estimators=64,
    min_samples_leaf=min_samples_leaf,
    random_state=2,
    n_jobs=-1,
)
regr_rf.fit(X_train, y_train)

# Predictions for both splits, compared side by side further down.
y_pred_test = regr_rf.predict(X_test)
y_pred_train = regr_rf.predict(X_train)
import matplotlib.pyplot as plt
Run predictions on both the training and the held-out test data to check that the two distributions look similar.
# Two side-by-side panels of predicted vs. actual price: held-out test rows on
# the left, a 25% random sample of the training rows on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,10))

panels = [
    ('test', pd.DataFrame({'test': y_test, 'pred': y_pred_test})),
    ('train', pd.DataFrame({'train': y_train, 'pred': y_pred_train}).sample(frac=0.25)),
]
for ax, (actual_column, frame) in zip(axes, panels):
    frame.plot.scatter(actual_column, 'pred', s=0.01, ax=ax)
    # Identical limits on both axes and both panels so they compare directly.
    ax.set_ylim(0, 300000)
    ax.set_xlim(0, 300000)
    ax.grid()
# Feature matrix for the "what if it sold in 2019" prediction: every row keeps
# its own location, but the transaction timestamp is replaced by one fixed
# reference date.
locations = np.array(df[['lat', 'lon']])

# Reference date (2019-10-01) as a float count of nanoseconds, matching the
# numeric form of the 'transaction_at' feature. Despite its name, this is a
# fixed date and not the current day.
today = np.datetime64(pd.to_datetime('2019-10-01'), 'ns').astype("float")

# Append the constant date as a third column. The original also built this
# repeated array once on its own line and threw the result away; that dead
# statement is gone.
X_2019 = np.column_stack([locations, np.full(len(locations), today)])
X_2019
Estimated prices as if each transaction had taken place in 2019.
`price_paid_pred_2019` accounts for location and transaction date only.
`price_adjusted_2019` also takes the property's actual price paid into account. It is rounded down to the nearest £1000.
# Model estimates at the real transaction date and at the 2019 reference date,
# truncated to whole numbers.
predicted_at_sale = regr_rf.predict(X)
predicted_in_2019 = regr_rf.predict(X_2019)
df['price_paid_pred'] = predicted_at_sale.astype("int")
df['price_paid_pred_2019'] = predicted_in_2019.astype("int")

# Scale the 2019 estimate by how far the real sale price sat above or below
# the model's estimate for that sale, then truncate to a multiple of 1000.
paid_vs_predicted = df['price_paid'] / df['price_paid_pred']
whole_thousands = (df['price_paid_pred_2019'] * paid_vs_predicted / 1000).astype("int")
df['price_adjusted_2019'] = whole_thousands * 1000
The adjusted prices should remain stable over time, which they do.
# Adjusted 2019 price against the original transaction date; the y-axis is
# capped at 600k.
ax = df.plot.scatter('transaction_at', 'price_adjusted_2019', s=0.01, figsize=(25,15))
ax.set_ylim(0, 600000)

# Label ticks as yy/mm with a formatter rather than a fixed list of strings:
# the labels stay correct if the tick positions change, and the conversion
# (from nanoseconds since the Unix epoch) no longer depends on the local
# timezone of the machine running the notebook.
ax.xaxis.set_major_formatter(
    plticker.FuncFormatter(lambda ts, _pos: pd.Timestamp(int(ts)).strftime('%y/%m')))
ax
The vast majority of houses appeared to increase in value. Three clusters are visible.
# Model estimate for 2019 (x) against the model estimate at the actual
# transaction date (y), one very small dot per property.
ax = df.plot.scatter(
    x='price_paid_pred_2019',
    y='price_paid_pred',
    s=0.00025,
    figsize=(10, 10),
)
ax.set_xlim(50000, 600000)
ax.set_ylim(0, 600000)
ax
# Show the final table, then write it (index included) next to the input data.
df
output = os.path.realpath('../data/shared/adjusted.csv')
df.to_csv(path_or_buf=output)