-
Notifications
You must be signed in to change notification settings - Fork 0
/
project_helper.py
134 lines (94 loc) · 4.04 KB
/
project_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import alphalens as al
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image
from sklearn.tree import export_graphviz
from zipline.assets._assets import Equity # Required for USEquityPricing
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.classifiers import Classifier
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.loaders import USEquityPricingLoader
from zipline.utils.numpy_utils import int64_dtype
EOD_BUNDLE_NAME = 'eod-quotemedia'
class PricingLoader(object):
def __init__(self, bundle_data):
self.loader = USEquityPricingLoader(
bundle_data.equity_daily_bar_reader,
bundle_data.adjustment_reader)
def get_loader(self, column):
if column not in USEquityPricing.columns:
raise Exception('Column not in USEquityPricing')
return self.loader
class Sector(Classifier):
dtype = int64_dtype
window_length = 0
inputs = ()
missing_value = -1
def __init__(self):
self.data = np.load('../../data/project_7_sector/data.npy')
def _compute(self, arrays, dates, assets, mask):
return np.where(
mask,
self.data[assets],
self.missing_value,
)
def build_pipeline_engine(bundle_data, trading_calendar):
pricing_loader = PricingLoader(bundle_data)
engine = SimplePipelineEngine(
get_loader=pricing_loader.get_loader,
calendar=trading_calendar.all_sessions,
asset_finder=bundle_data.asset_finder)
return engine
def plot_tree_classifier(clf, feature_names=None):
dot_data = export_graphviz(
clf,
out_file=None,
feature_names=feature_names,
filled=True,
rounded=True,
special_characters=True,
rotate=True)
return Image(graphviz.Source(dot_data).pipe(format='png'))
def plot(xs, ys, labels, title='', x_label='', y_label=''):
for x, y, label in zip(xs, ys, labels):
plt.ylim((0.5, 0.55))
plt.plot(x, y, label=label)
plt.title(title)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend(bbox_to_anchor=(1.04, 1), borderaxespad=0)
plt.show()
def rank_features_by_importance(importances, feature_names):
indices = np.argsort(importances)[::-1]
max_feature_name_length = max([len(feature) for feature in feature_names])
print(' Feature{space: <{padding}} Importance'.format(padding=max_feature_name_length - 8, space=' '))
for x_train_i in range(len(importances)):
print('{number:>2}. {feature: <{padding}} ({importance})'.format(
number=x_train_i + 1,
padding=max_feature_name_length,
feature=feature_names[indices[x_train_i]],
importance=importances[indices[x_train_i]]))
def sharpe_ratio(factor_returns, annualization_factor=np.sqrt(252)):
return annualization_factor * factor_returns.mean() / factor_returns.std()
def get_factor_returns(factor_data):
ls_factor_returns = pd.DataFrame()
for factor, factor_data in factor_data.items():
ls_factor_returns[factor] = al.performance.factor_returns(factor_data).iloc[:, 0]
return ls_factor_returns
def plot_factor_returns(factor_returns):
(1 + factor_returns).cumprod().plot(ylim=(0.8, 1.2))
def plot_factor_rank_autocorrelation(factor_data):
ls_FRA = pd.DataFrame()
unixt_factor_data = {
factor: factor_data.set_index(pd.MultiIndex.from_tuples(
[(x.timestamp(), y) for x, y in factor_data.index.values],
names=['date', 'asset']))
for factor, factor_data in factor_data.items()}
for factor, factor_data in unixt_factor_data.items():
ls_FRA[factor] = al.performance.factor_rank_autocorrelation(factor_data)
ls_FRA.plot(title="Factor Rank Autocorrelation", ylim=(0.8, 1.0))
def build_factor_data(factor_data, pricing):
return {factor_name: al.utils.get_clean_factor_and_forward_returns(factor=data, prices=pricing, periods=[1])
for factor_name, data in factor_data.iteritems()}