In [1]:
import numpy as np
from vowpalwabbit import pyvw
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from vowpalwabbit.sklearn_vw import VWClassifier
import re
import itertools
from operator import itemgetter
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import pandas as pd
/home/nehal/.virtenvs/odc-challenge/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
In [2]:
%matplotlib inline
In [3]:
# Directory prefix that is prepended to the data file names opened in this notebook.
DIR = "./data/"
In [ ]:
# Smoke-test the Vowpal Wabbit scikit-learn wrapper on the synthetic
# Hastie 10.2 dataset with a fixed 80/20 train/test split.
X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)
model = VWClassifier()
model.fit(X_train, y_train)
# A bare expression is only displayed when it is the last line of a cell, so
# the train score used to be computed and then thrown away. Keep both scores
# and show them together as (train, test).
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
train_score, test_score
In [ ]:
from collections import defaultdict
import ast
import re

# Pattern for a parenthesised "(<digits>, <number>)" pair, e.g. "(12, 3.5)".
# The '.' is unescaped, so it matches ANY single character, not only a decimal
# point; that is also why exponent-only values such as "(12, 1e-05)" match.
regex = r"\(\d+, \d+.[\de-]+\)"

# Maps the first element of each parsed pair to the list of second elements;
# unseen keys start out as an empty list.
feature_dict = defaultdict(list)
In [ ]:
# Rewrite source lines 100000..599999 as one text line each, shaped like
#   <el[1]> '<el[0]> | <k>:<v> <k>:<v> ... \n
# (presumably Vowpal Wabbit's "label 'tag | features" input format - confirm).
#
# Both files are managed by a single `with`, so the output handle is closed
# even if a line fails to parse. The output is opened in text mode ('w')
# because str objects are written to it; a binary handle rejects str on
# Python 3.
#
# NOTE(review): this same path is loaded elsewhere in the notebook as a
# two-column CSV, while this loop indexes el[2][1]; confirm the input file name.
with open(DIR + 'total_spend.txt', 'r') as train_data, \
        open("./vw_compatible_file.vw", 'w') as vw_compatible_file:
    for line in itertools.islice(train_data, 100000, 600000):
        # SECURITY: eval() executes whatever expression the line contains.
        # Only run this on trusted files; ast.literal_eval is the safe choice
        # if every line is a plain Python literal.
        el = eval(line)
        features = "".join(str(item[0]) + ":" + str(item[1]) + " " for item in el[2][1])
        vw_compatible_file.write(str(el[1]) + " '" + str(el[0]) + " | " + features + "\n")
In [8]:
# Read the header-less, comma-separated spend file into a DataFrame.
total_spend_path = DIR + 'total_spend.txt'
total_spend_df = pd.read_csv(total_spend_path, header=None, sep=",")
In [32]:
# Number of rows in the loaded spend frame.
total_spend_df.shape[0]
Out[32]:
12440009
In [10]:
# Give the two positional columns descriptive names.
spend_column_names = ["Household ID", "Total Expenditure"]
total_spend_df.columns = spend_column_names
In [38]:
# The 50 most frequent expenditure values as (value, occurrence count) pairs,
# most frequent first.
expenditure_values = total_spend_df["Total Expenditure"]
common_50 = Counter(expenditure_values).most_common(50)
In [40]:
# Split the (value, count) pairs into two parallel lists for plotting.
common_50_x = [entry[0] for entry in common_50]
common_50_y = [entry[1] for entry in common_50]
In [50]:
# Plot how many households share each of the most frequent spend values.
# The input lists are in the order they were collected (by frequency, not by
# spend), so connect the points in ascending spend order; otherwise the '-o'
# line jumps back and forth across the x-axis.
points_by_spend = sorted(zip(common_50_x, common_50_y))
plt.plot([point[0] for point in points_by_spend],
         [point[1] for point in points_by_spend], '-o')
plt.xlabel("$ Spend")
plt.ylabel("Number of Households")
Out[50]:
<matplotlib.text.Text at 0x7f2ae82adb38>
In [19]:
# Households whose total expenditure is strictly between 0 and 500.
# Each comparison needs its own parentheses: `&` binds tighter than `<` and `>`,
# so the unparenthesised form evaluated `((spend < 500) & spend) > 0` instead of
# two independent range checks.
cutoff_total_spend_df = total_spend_df[
    (total_spend_df['Total Expenditure'] > 0) & (total_spend_df['Total Expenditure'] < 500)
]
In [29]:
# Households whose total expenditure is strictly between 500 and 10000
# (a value of exactly 500 is not included).
upper_band_mask = (total_spend_df["Total Expenditure"] > 500) & (total_spend_df["Total Expenditure"] < 10000)
upper_limit_total_spend_df = total_spend_df[upper_band_mask]
In [20]:
# Display the filtered frame to inspect which rows the cutoff filter kept.
cutoff_total_spend_df
Out[20]:
Household ID Total Expenditure
0 2964841 126.00
1 31389918 365.88
3 31390092 344.25
4 31390102 60.00
5 31390134 179.00
7 31390210 229.99
9 31390279 179.94
10 31390294 14.99
11 31390306 94.49
13 2964853 216.23
15 31390346 49.50
16 485583 44.50
18 31390388 170.00
19 31390444 43.18
20 31390470 417.42
21 31390501 397.85
22 31390502 434.87
23 31390515 65.97
24 31390566 304.84
25 31390592 287.96
27 31390624 186.00
28 485590 22.98
29 31390636 87.50
30 31390673 34.50
31 31390679 183.09
32 31390820 33.98
33 31390822 229.85
36 485599 209.40
37 2964861 230.32
39 31390934 13.99
... ... ...
12439975 31388979 378.00
12439976 31389000 368.64
12439977 31389052 29.99
12439978 31389072 111.97
12439979 485552 39.98
12439980 31389087 257.08
12439981 31389100 213.75
12439983 31389168 39.16
12439984 31389232 259.80
12439986 31389257 74.00
12439987 31389271 61.20
12439988 31389299 49.99
12439990 31389326 179.41
12439992 31389359 74.99
12439993 31389365 144.16
12439994 31389434 487.01
12439995 31389438 69.98
12439996 485561 498.80
12439997 31389468 497.25
12439998 31389478 246.90
12439999 31389506 188.74
12440000 31389509 84.97
12440001 31389599 252.69
12440002 31389641 449.83
12440003 31389652 37.97
12440004 31389655 239.94
12440005 31389673 277.24
12440006 31389710 174.00
12440007 31389733 19.99
12440008 2964839 59.99

9069132 rows × 2 columns

In [27]:
# Display the filtered frame to inspect which rows the upper-range filter kept.
upper_limit_total_spend_df
Out[27]:
Household ID Total Expenditure
2 31390077 2138.49
6 31390208 5831.89
8 31390260 1472.04
12 2964852 2420.45
14 31390343 659.87
17 31390372 593.85
26 31390605 4902.59
34 485597 594.95
35 31390877 1465.11
38 31390923 3145.63
41 31391000 2139.35
43 31391100 5011.30
44 2964865 726.80
49 31391248 3005.52
57 31391438 1837.67
59 31391457 1216.98
60 485614 5041.55
63 31391644 722.67
65 31391660 518.41
67 31391837 3839.46
68 485622 2955.95
69 31391930 573.34
78 31392134 2109.02
82 31392210 569.82
84 31392247 925.76
87 485633 2633.61
89 31392362 2179.97
92 31392451 635.70
93 31392496 2552.98
98 31392699 688.92
... ... ...
12439894 31386848 3228.40
12439895 31386878 507.60
12439897 485484 3827.59
12439898 31386971 540.38
12439900 2964703 727.76
12439902 2964707 1007.43
12439908 31387415 2895.50
12439909 31387445 524.80
12439910 2964735 1533.16
12439917 31387557 724.84
12439919 31387587 5271.92
12439922 31387658 543.54
12439923 31387704 906.06
12439925 31387742 16963.16
12439926 485513 2210.42
12439934 31387834 1450.96
12439935 31387874 1547.55
12439937 31387890 823.64
12439942 31388164 723.44
12439944 2964781 1945.55
12439950 31388266 545.41
12439958 31388562 2237.57
12439960 46748 830.56
12439966 31388741 763.15
12439971 31388870 1818.45
12439973 31388962 1116.90
12439982 31389105 7221.62
12439985 31389256 670.75
12439989 485558 2343.14
12439991 31389334 875.75

3370510 rows × 2 columns

In [30]:
# 50-bin histogram of the expenditure column of cutoff_total_spend_df.
cutoff_expenditure = cutoff_total_spend_df["Total Expenditure"]
plt.hist(cutoff_expenditure, bins=50)
plt.ylabel("Number of Households")
plt.xlabel("$ expenditure in (0,500)")
plt.title("Spend trend of households")
Out[30]:
<matplotlib.text.Text at 0x7f2ae808e898>
In [31]:
# 50-bin histogram of the expenditure column of upper_limit_total_spend_df.
# That frame is built with 500 < spend < 10000, so the x label states both
# bounds rather than the open-ended "(500:)".
plt.hist(upper_limit_total_spend_df["Total Expenditure"], bins=50)
plt.xlabel("$ expenditure in (500,10000)")
plt.ylabel("Number of Households")
plt.title("Spend trend of households")
Out[31]:
<matplotlib.text.Text at 0x7f2ae89c5ef0>
In [86]:
from collections import defaultdict
import ast
import re

# Pattern for a parenthesised "(<digits>, <number>)" pair, e.g. "(12, 3.5)".
# The '.' is unescaped, so it matches ANY single character, not only a decimal
# point; that is also why exponent-only values such as "(12, 1e-05)" match.
regex = r"\(\d+, \d+.[\de-]+\)"

# Maps the first element of each parsed pair to the list of second elements;
# unseen keys start out as an empty list.
feature_dict = defaultdict(list)
In [88]:
# Scan the first 100000 lines of train.txt and, for every "(a, b)" pair the
# regex finds, append b to feature_dict[a].
with open(DIR + "train.txt", 'rb') as train_file:
    for raw_line in itertools.islice(train_file, 100000):
        decoded_line = raw_line.decode("utf-8")
        matched_texts = re.findall(regex, decoded_line, flags=0)
        # Parse every match on the line before touching feature_dict, so a
        # line that fails to parse contributes nothing.
        parsed_tuples = [ast.literal_eval(text) for text in matched_texts]
        for parsed in parsed_tuples:
            feature_dict[parsed[0]].append(parsed[1])
In [108]:
# del feature_dict
In [92]:
# len(feature_dict)
# Save the collected mapping to disk using the newest pickle protocol available.
with open('feature_dict_100000_hp.pickle', 'wb') as pickle_file:
    pickle.dump(feature_dict, pickle_file, pickle.HIGHEST_PROTOCOL)