In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Adjust Price Paid

Predict what properties would have sold for, if sold in 2019

In [2]:
import os
import pandas as pd
from ast import literal_eval
from datetime import datetime
import matplotlib.ticker as plticker

filepath = os.path.realpath('../data/shared/prepared.csv')
In [3]:
df = pd.read_csv(filepath)
df['transaction_at'] = pd.to_numeric(pd.to_datetime(df['transaction_at']))

Price paid distribution over time

In [4]:
ax = df.plot.scatter('transaction_at', 'price_paid', s=0.01, figsize=(25,15))
ax.set_ylim(0, 300000)
ax.set_xticklabels([datetime.fromtimestamp(ts / 1e9).strftime('%y/%m') for ts in ax.get_xticks()])
ax
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47b8e56e10>

Location distribution

Major cities like London / Bristol / Manchester, and some coastlines, are visible.

In [5]:
df.plot.scatter('lon', 'lat', s=0.02, figsize=(10,10))
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd911e8c390>
In [6]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

Training

Training a random forest to predict price paid based upon location & transaction date.

In [7]:
X = np.array(df[['lat', 'lon', 'transaction_at']])
y = np.array(df['price_paid'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=4)

# prevents overfitting
min_samples_leaf = 10

regr_rf = RandomForestRegressor(n_estimators=64, min_samples_leaf=min_samples_leaf,
                                random_state=2, n_jobs=-1)
regr_rf.fit(X_train, y_train)

y_pred_test = regr_rf.predict(X_test)
y_pred_train = regr_rf.predict(X_train)
In [8]:
import matplotlib.pyplot as plt

Tuning overfit

Run predictions on both training & validation data to ensure the distributions look similar.

In [9]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,10))

ax = axes[0]
pd.DataFrame({'test': y_test, 'pred': y_pred_test}).plot.scatter('test', 'pred', s=0.01, ax=ax)
ax.set_ylim(0, 300000)
ax.set_xlim(0, 300000)
ax.grid()

ax = axes[1]
pd.DataFrame({'train': y_train, 'pred': y_pred_train}).sample(frac=0.25).plot.scatter('train', 'pred', s=0.01, ax=ax)
ax.set_ylim(0, 300000)
ax.set_xlim(0, 300000)
ax.grid()
In [12]:
X_2019 = np.array(df[['lat', 'lon']])
today = np.datetime64(pd.to_datetime('2019-10-01'), 'ns').astype("float")
np.repeat(today, len(X_2019))
X_2019 = np.c_[X_2019, np.repeat(today, len(X_2019))]
X_2019
Out[12]:
array([[ 5.21391131e+01, -4.59446286e-01,  1.56988800e+18],
       [ 5.21321279e+01, -4.65166230e-01,  1.56988800e+18],
       [ 5.21146255e+01, -4.98956071e-01,  1.56988800e+18],
       ...,
       [ 5.21137015e+01, -1.90448267e+00,  1.56988800e+18],
       [ 5.21642858e+01, -2.19858199e+00,  1.56988800e+18],
       [ 5.20901039e+01, -1.93776448e+00,  1.56988800e+18]])

Predict price paid

If transaction was in 2019.

price_paid_pred_2019 accounts for location and transaction date only.

price_adjusted_2019 also takes the property's actual price paid into account. It is rounded down to the nearest £1000.

In [13]:
df['price_paid_pred'] = regr_rf.predict(X).astype("int")
df['price_paid_pred_2019'] = regr_rf.predict(X_2019).astype("int")
df['price_adjusted_2019'] = (df['price_paid_pred_2019'] * (df['price_paid'] / df['price_paid_pred']) / 1000).astype("int") * 1000

Validating adjusted prices

The adjusted prices should remain stable over time, which they do.

In [15]:
ax = df.plot.scatter('transaction_at', 'price_adjusted_2019', s=0.01, figsize=(25,15))
ax.set_ylim(0, 600000)
ax.set_xticklabels([datetime.fromtimestamp(ts / 1e9).strftime('%y/%m') for ts in ax.get_xticks()])
ax
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd8fdea0fd0>

How significant were our adjustments?

The vast majority of houses appeared to increase in value. Three clusters are visible.

In [16]:
ax = df.plot.scatter('price_paid_pred_2019', 'price_paid_pred', s=0.00025, figsize=(10,10))
ax.set_xlim(50000, 600000)
ax.set_ylim(0, 600000)
ax
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd8fdf053d0>
In [18]:
df
Out[18]:
latlonprice_paidtransaction_atprice_paid_predprice_paid_pred_2019price_adjusted_2019
052.139113-0.459446250000809913600000000000542337945703662000
152.132128-0.4651664200079358400000000000056266430472321000
252.114625-0.4989566000080887680000000000079377530769401000
351.912936-0.5005073450081233280000000000041292237474198000
451.916877-0.6779972850080974080000000000042504343812230000
........................
136694552.086601-1.9615282270001548892800000000000198178203705233000
136694652.123197-2.0332002525001561939200000000000318676323636256000
136694752.113702-1.9044834350001562112000000000000321009320438434000
136694852.164286-2.1985822710001563840000000000000274032297396294000
136694952.090104-1.9377641600001562630400000000000196972204814166000

1366950 rows × 7 columns

In [ ]:
output = os.path.realpath('../data/shared/adjusted.csv')
df.to_csv(output)