From 3e92169f859a06b26683be24178c42dfb21efaae Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 10 Mar 2021 01:56:02 +0900 Subject: [PATCH 01/30] add period ops class --- qlib/config.py | 3 + qlib/data/base.py | 206 ++++++ qlib/data/cache.py | 4 +- qlib/data/data.py | 4 +- qlib/data/ops.py | 34 +- qlib/data/ops_period.py | 1431 +++++++++++++++++++++++++++++++++++++++ qlib/utils/__init__.py | 2 +- 7 files changed, 1645 insertions(+), 39 deletions(-) create mode 100644 qlib/data/ops_period.py diff --git a/qlib/config.py b/qlib/config.py index 52b05568d5..d1f8897898 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -189,6 +189,7 @@ def set_conf_from_C(self, config_c): "region": REG_CN, ## Custom Operator "custom_ops": [], + "custom_period_ops": [], }, } @@ -297,11 +298,13 @@ def set(self, default_conf="client", **kwargs): def register(self): from .utils import init_instance_by_config from .data.ops import register_all_ops + from .data.ops_period import register_all_period_ops from .data.data import register_all_wrappers from .workflow import R, QlibRecorder from .workflow.utils import experiment_exit_handler register_all_ops(self) + register_all_period_ops(self) register_all_wrappers(self) # set up QlibRecorder exp_manager = init_instance_by_config(self["exp_manager"]) diff --git a/qlib/data/base.py b/qlib/data/base.py index e318843c4a..c3a74545e8 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -9,6 +9,7 @@ import pandas as pd + class Expression(abc.ABC): """Expression base class""" @@ -224,3 +225,208 @@ class ExpressionOps(Expression): """ pass + + +class PExpression(abc.ABC): + """PExpression base class""" + + def __str__(self): + return type(self).__name__ + + def __repr__(self): + return str(self) + + def __gt__(self, other): + from .ops import PGt + + return PGt(self, other) + + def __ge__(self, other): + from .ops import PGe + + return PGe(self, other) + + def __lt__(self, other): + from .ops import PLt + + return PLt(self, other) + + def __le__(self, other): + from .ops import PLe + + return PLe(self, other) + + def __eq__(self, other): + from .ops import PEq + + return PEq(self, other) + + def __ne__(self, other): + from .ops import PNe + + return PNe(self, other) + + def __add__(self, other): + from .ops import PAdd + + return PAdd(self, other) + + def __radd__(self, other): + from .ops import PAdd + + return PAdd(other, self) + + def __sub__(self, other): + from .ops import PSub + + return PSub(self, other) + + def __rsub__(self, other): + from .ops import PSub + + return PSub(other, self) + + def __mul__(self, other): + from .ops import PMul + + return PMul(self, other) + + def __rmul__(self, other): + from .ops import PMul + + return PMul(self, other) + + def __div__(self, other): + from .ops import PDiv + + return PDiv(self, other) + + def __rdiv__(self, other): + from .ops import PDiv + + return PDiv(other, self) + + def __truediv__(self, other): + from .ops import PDiv + + return PDiv(self, other) + + def __rtruediv__(self, other): + from .ops import PDiv + + return PDiv(other, self) + + def __pow__(self, other): + from .ops import PPower + + return PPower(self, other) + + def __and__(self, other): + from .ops import PAnd + + return PAnd(self, other) + + def __rand__(self, other): + from .ops import PAnd + + return PAnd(other, self) + + def __or__(self, other): + from .ops import POr + + return POr(self, other) + + def __ror__(self, other): + from .ops import POr + + return POr(other, self) + + + @abc.abstractmethod + def load_period_data(self, instrument, 
start_offset, end_offset, cur_index): + raise NotImplementedError("This function must be implemented in your newly defined feature") + + @abc.abstractmethod + def get_period_offset(self, cur_index): + raise NotImplementedError("This function must be implemented in your newly defined feature") + + def load(self, instrument, start_index, end_index, freq): + from .cache import H + + # cache + args = str(self), instrument, start_index, end_index, freq + if args in H["f"]: + return H["f"][args] + if start_index is None or end_index is None or start_index > end_index: + raise ValueError("Invalid index range: {} {}".format(start_index, end_index)) + + resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype='float32', name=str(self)) + for cur_index in range(start_index, end_index + 1): + start_offset, end_offset = self.get_period_offset(cur_index) + resample_data[cur_index] = self.load_period_data(instrument, start_offset, end_offset, cur_index).iloc[-1] + + H["f"][args] = resample_series + return resample_data + + def get_longest_back_rolling(self): + return 0 + + def get_extended_window_size(self): + return 0, 0 + + +class PFeature(PExpression): + + def __init__(self, name=None): + if name: + self._name = name.lower() + else: + self._name = type(self).__name__.lower() + + def __str__(self): + return "$" + self._name + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + ### Zhou Code + return pd.Series([1,2,3]) + + def get_period_offset(self, cur_index): + return 0 + + +class PExpressionOps(PExpression): + """Operator Expression + + This kind of feature will use operator for feature + construction on the fly. + """ + + pass + +class OpsWrapper: + """Ops Wrapper""" + + def __init__(self): + self._ops = {} + + def reset(self): + self._ops = {} + + def register(self, ops_list): + for operator in ops_list: + if not issubclass(operator, ExpressionOps) and not issubclass(operator, PExpressionOps): + raise TypeError("operator must be subclass of ExpressionOps or PExpressionOps, not {}".format(operator)) + + if operator.__name__ in self._ops: + get_module_logger(self.__class__.__name__).warning( + "The custom operator [{}] will override the qlib default definition".format(operator.__name__) + ) + self._ops[operator.__name__] = operator + + def __getattr__(self, key): + if key not in self._ops: + raise AttributeError("The operator [{0}] is not registered".format(key)) + return self._ops[key] + + +Operators = OpsWrapper() \ No newline at end of file diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 0174dc63f1..23a888d98c 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -31,9 +31,7 @@ ) from ..log import get_module_logger -from .base import Feature - -from .ops import Operators +from .base import Feature, PFeature, Operators class QlibCacheException(RuntimeError): diff --git a/qlib/data/data.py b/qlib/data/data.py index 762467da35..bce173eeaa 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -19,10 +19,9 @@ from .cache import H from ..config import C -from .ops import Operators from ..log import get_module_logger from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname -from .base import Feature +from .base import Feature, PFeature, Operators from .cache import DiskDatasetCache, DiskExpressionCache from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path @@ -480,7 +479,6 @@ def expression_calculator(inst, start_time, end_time, freq, column_names, 
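# A minimal sketch, assuming one offset per index, of what the per-index loop in
# PExpression.load above seems intended to do: as committed it creates
# `resample_series` but writes into and returns `resample_data`, and it unpacks
# `get_period_offset` into two values although the operators in ops_period.py
# return a single offset.  `_load_period_series` below is a hypothetical helper,
# not part of this patch.
import pandas as pd


def _load_period_series(expr, instrument, start_index, end_index):
    series = pd.Series(
        index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(expr)
    )
    for cur_index in range(start_index, end_index + 1):
        start_offset = expr.get_period_offset(cur_index)
        # keep the value observable at `cur_index`, i.e. the last row of the loaded window
        series[cur_index] = expr.load_period_data(instrument, start_offset, 0, cur_index).iloc[-1]
    return series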
spans= _calendar = Cal.calendar(freq=freq) data.index = _calendar[data.index.values.astype(int)] data.index.names = ["datetime"] - if spans is None: return data else: diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 8bc7e1fa7c..0e698cd421 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -74,7 +74,6 @@ class NpElemOperator(ElemOperator): """ def __init__(self, feature, func): - self.feature = feature self.func = func super(NpElemOperator, self).__init__(feature) @@ -289,8 +288,6 @@ class NpPairOperator(PairOperator): """ def __init__(self, feature_left, feature_right, func): - self.feature_left = feature_left - self.feature_right = feature_right self.func = func super(NpPairOperator, self).__init__(feature_left, feature_right) @@ -1489,39 +1486,12 @@ def __init__(self, feature_left, feature_right, N): ] -class OpsWrapper: - """Ops Wrapper""" - - def __init__(self): - self._ops = {} - - def reset(self): - self._ops = {} - - def register(self, ops_list): - for operator in ops_list: - if not issubclass(operator, ExpressionOps): - raise TypeError("operator must be subclass of ExpressionOps, not {}".format(operator)) - - if operator.__name__ in self._ops: - get_module_logger(self.__class__.__name__).warning( - "The custom operator [{}] will override the qlib default definition".format(operator.__name__) - ) - self._ops[operator.__name__] = operator - - def __getattr__(self, key): - if key not in self._ops: - raise AttributeError("The operator [{0}] is not registered".format(key)) - return self._ops[key] - - -Operators = OpsWrapper() - - def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") + from .base import Operators + Operators.reset() Operators.register(OpsList) diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py new file mode 100644 index 0000000000..7409c2ac15 --- /dev/null +++ b/qlib/data/ops_period.py @@ -0,0 +1,1431 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import sys +import abc +import numpy as np +import pandas as pd + +from scipy.stats import percentileofscore + +from .base import PExpression, PExpressionOps +from ..log import get_module_logger + +try: + from ._libs.rolling import rolling_slope, rolling_rsquare, rolling_resi + from ._libs.expanding import expanding_slope, expanding_rsquare, expanding_resi +except ImportError: + print( + "#### Do not import qlib package in the repository directory in case of importing qlib from . 
without compiling #####" + ) + raise + + +np.seterr(invalid="ignore") + +#################### Element-Wise Operator #################### + + +class PElemOperator(PExpressionOps): + + def __init__(self, feature): + self.feature = feature + + def __str__(self): + return "{}({})".format(type(self).__name__, self.feature) + + def get_period_offset(self, cur_index): + return self.feature.get_period_offset(cur_index) + + +class PNpElemOperator(PElemOperator): + """Numpy Element-wise Operator + + Parameters + ---------- + feature : Expression + feature instance + func : str + numpy feature operation method + + Returns + ---------- + Expression + feature operation output + """ + + def __init__(self, feature, func): + self.func = func + super(PNpElemOperator, self).__init__(feature) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + return getattr(np, self.func)(series) + +class PAbs(PNpElemOperator): + """Feature Absolute Value + + Parameters + ---------- + feature : Expression + feature instance + + Returns + ---------- + Expression + a feature instance with absolute output + """ + + def __init__(self, feature): + super(PAbs, self).__init__(feature, "abs") + + +class PSign(PNpElemOperator): + """Feature PSign + + Parameters + ---------- + feature : Expression + feature instance + + Returns + ---------- + Expression + a feature instance with sign + """ + + def __init__(self, feature): + super(PSign, self).__init__(feature, "sign") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + """ + To avoid error raised by bool type input, we transform the data into float32. + """ + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # TODO: More precision types should be configurable + series = series.astype(np.float32) + return getattr(np, self.func)(series) + + +class PLog(PNpElemOperator): + """Feature PLog + + Parameters + ---------- + feature : Expression + feature instance + + Returns + ---------- + Expression + a feature instance with log + """ + + def __init__(self, feature): + super(PLog, self).__init__(feature, "log") + + +class PPower(PNpElemOperator): + """Feature PPower + + Parameters + ---------- + feature : Expression + feature instance + + Returns + ---------- + Expression + a feature instance with power + """ + + def __init__(self, feature, exponent): + super(PPower, self).__init__(feature, "power") + self.exponent = exponent + + def __str__(self): + return "{}({},{})".format(type(self).__name__, self.feature, self.exponent) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + return getattr(np, self.func)(series, self.exponent) + + +class PMask(PNpElemOperator): + """Feature PMask + + Parameters + ---------- + feature : Expression + feature instance + instrument : str + instrument mask + + Returns + ---------- + Expression + a feature instance with masked instrument + """ + + def __init__(self, feature, instrument): + super(PMask, self).__init__(feature, "mask") + self.instrument = instrument + + def __str__(self): + return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower()) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + + return self.feature.load_period_data(self.instrument, start_offset, end_offset, cur_index) + + +class 
PNot(PNpElemOperator): + """PNot Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + feature elementwise not output + """ + + def __init__(self, feature): + super(PNot, self).__init__(feature, "bitwise_not") + + +#################### Pair-Wise Operator #################### +class PPairOperator(PExpressionOps): + """Pair-wise operator + + Parameters + ---------- + feature_left : Expression + feature instance or numeric value + feature_right : Expression + feature instance or numeric value + func : str + operator function + + Returns + ---------- + Feature: + two features' operation output + """ + + def __init__(self, feature_left, feature_right): + self.feature_left = feature_left + self.feature_right = feature_right + + def __str__(self): + return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right) + + def get_period_offset(self, cur_index): + if isinstance(self.feature_left, PExpression): + left_br = self.feature_left.get_period_offset(cur_index) + else: + left_br = 0 + + if isinstance(self.feature_right, PExpression): + right_br = self.feature_right.get_period_offset(cur_index) + else: + right_br = 0 + return max(left_br, right_br) + +class PNpPairOperator(PPairOperator): + """Numpy Pair-wise operator + + Parameters + ---------- + feature_left : Expression + feature instance or numeric value + feature_right : Expression + feature instance or numeric value + func : str + operator function + + Returns + ---------- + Feature: + two features' operation output + """ + + def __init__(self, feature_left, feature_right, func): + self.feature_left = feature_left + self.feature_right = feature_right + self.func = func + super(PNpPairOperator, self).__init__(feature_left, feature_right) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + assert any( + [isinstance(self.feature_left, Expression), self.feature_right, Expression] + ), "at least one of two inputs is Expression instance" + if isinstance(self.feature_left, Expression): + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + else: + series_left = self.feature_left # numeric value + if isinstance(self.feature_right, Expression): + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + else: + series_right = self.feature_right + return getattr(np, self.func)(series_left, series_right) + + +class PAdd(PNpPairOperator): + """PAdd Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' sum + """ + + def __init__(self, feature_left, feature_right): + super(PAdd, self).__init__(feature_left, feature_right, "add") + + +class PSub(PNpPairOperator): + """Subtract Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' subtraction + """ + + def __init__(self, feature_left, feature_right): + super(PSub, self).__init__(feature_left, feature_right, "subtract") + + +class PMul(PNpPairOperator): + """Multiply Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' product + """ + + def __init__(self, feature_left, 
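# A hedged sketch of the check in PNpPairOperator.load_period_data above: the
# assert as written evaluates `self.feature_right` and the class `Expression`
# for truthiness instead of running isinstance tests, and this module works with
# PExpression rather than Expression.  Assuming that intent, `_load_pair` below
# (a hypothetical standalone helper) shows the corrected logic.
import numpy as np

from qlib.data.base import PExpression  # as introduced by this patch


def _load_pair(op, instrument, start_offset, end_offset, cur_index):
    assert isinstance(op.feature_left, PExpression) or isinstance(
        op.feature_right, PExpression
    ), "at least one of the two inputs must be a PExpression instance"
    left = (
        op.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index)
        if isinstance(op.feature_left, PExpression)
        else op.feature_left  # numeric constant
    )
    right = (
        op.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index)
        if isinstance(op.feature_right, PExpression)
        else op.feature_right  # numeric constant
    )
    return getattr(np, op.func)(left, right)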
feature_right): + super(PMul, self).__init__(feature_left, feature_right, "multiply") + + +class PDiv(PNpPairOperator): + """Division Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' division + """ + + def __init__(self, feature_left, feature_right): + super(PDiv, self).__init__(feature_left, feature_right, "divide") + + +class PGreater(PNpPairOperator): + """PGreater Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + greater elements taken from the input two features + """ + + def __init__(self, feature_left, feature_right): + super(PGreater, self).__init__(feature_left, feature_right, "maximum") + + +class PLess(PNpPairOperator): + """PLess Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + smaller elements taken from the input two features + """ + + def __init__(self, feature_left, feature_right): + super(PLess, self).__init__(feature_left, feature_right, "minimum") + + +class PGt(PNpPairOperator): + """PGreater Than Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left > right` + """ + + def __init__(self, feature_left, feature_right): + super(PGt, self).__init__(feature_left, feature_right, "greater") + + +class PGe(PNpPairOperator): + """PGreater Equal Than Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left >= right` + """ + + def __init__(self, feature_left, feature_right): + super(PGe, self).__init__(feature_left, feature_right, "greater_equal") + + +class PLt(PNpPairOperator): + """PLess Than Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left < right` + """ + + def __init__(self, feature_left, feature_right): + super(PLt, self).__init__(feature_left, feature_right, "less") + + +class PLe(PNpPairOperator): + """PLess Equal Than Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left <= right` + """ + + def __init__(self, feature_left, feature_right): + super(PLe, self).__init__(feature_left, feature_right, "less_equal") + + +class PEq(PNpPairOperator): + """Equal Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left == right` + """ + + def __init__(self, feature_left, feature_right): + super(PEq, self).__init__(feature_left, feature_right, "equal") + + +class PNe(PNpPairOperator): + """PNot Equal Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + bool series indicate `left != right` + """ + + def __init__(self, feature_left, feature_right): + super(PNe, self).__init__(feature_left, 
feature_right, "not_equal") + + +class PAnd(PNpPairOperator): + """PAnd Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' row by row & output + """ + + def __init__(self, feature_left, feature_right): + super(PAnd, self).__init__(feature_left, feature_right, "bitwise_and") + + +class POr(PNpPairOperator): + """POr Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + + Returns + ---------- + Feature: + two features' row by row | outputs + """ + + def __init__(self, feature_left, feature_right): + super(POr, self).__init__(feature_left, feature_right, "bitwise_or") + + +#################### Triple-wise Operator #################### +class PIf(PExpressionOps): + """PIf Operator + + Parameters + ---------- + condition : Expression + feature instance with bool values as condition + feature_left : Expression + feature instance + feature_right : Expression + feature instance + """ + + def __init__(self, condition, feature_left, feature_right): + self.condition = condition + self.feature_left = feature_left + self.feature_right = feature_right + + def __str__(self): + return "PIf({},{},{})".format(self.condition, self.feature_left, self.feature_right) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series_cond = self.condition.load_period_data(instrument, start_offset, end_offset, cur_index) + if isinstance(self.feature_left, Expression): + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + else: + series_left = self.feature_left + if isinstance(self.feature_right, Expression): + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + else: + series_right = self.feature_right + series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index) + return series + + def get_period_offset(self, cur_index): + if isinstance(self.feature_left, Expression): + left_br = self.feature_left.get_period_offset(cur_index) + else: + left_br = 0 + + if isinstance(self.feature_right, Expression): + right_br = self.feature_right.get_period_offset(cur_index) + else: + right_br = 0 + + if isinstance(self.condition, Expression): + c_br = self.condition.get_period_offset(cur_index) + else: + c_br = 0 + return max(left_br, right_br, c_br) + + + +#################### PRolling #################### +# NOTE: methods like `rolling.mean` are optimized with cython, +# and are super faster than `rolling.apply(np.mean)` + + +class PRolling(PExpressionOps): + """PRolling Operator + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + func : str + rolling method + + Returns + ---------- + Expression + rolling outputs + """ + + def __init__(self, feature, N, func): + self.feature = feature + self.N = N + self.func = func + + def __str__(self): + return "{}({},{})".format(type(self).__name__, self.feature, self.N) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # NOTE: remove all null check, + # now it's user's responsibility to decide whether use features in null days + # isnull = series.isnull() # NOTE: isnull = NaN, inf is not null + if self.N == 0: + series = 
getattr(series.expanding(min_periods=1), self.func)() + elif 0 < self.N < 1: + series = series.ewm(alpha=self.N, min_periods=1).mean() + else: + series = getattr(series.rolling(self.N, min_periods=1), self.func)() + # series.iloc[:self.N-1] = np.nan + # series[isnull] = np.nan + return series + + def get_period_offset(self, cur_index): + if self.N == 0: + return np.inf + if 0 < self.N < 1: + return int(np.log(1e-6) / np.log(1 - self.N)) # (1 - N)**window == 1e-6 + return self.feature.get_period_offset(cur_index) + self.N - 1 + + +class PRef(PRolling): + """Feature Reference + + Parameters + ---------- + feature : Expression + feature instance + N : int + N = 0, retrieve the first data; N > 0, retrieve data of N periods ago; N < 0, future data + + Returns + ---------- + Expression + a feature instance with target reference + """ + + def __init__(self, feature, N): + super(PRef, self).__init__(feature, N, "ref") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # N = 0, return first day + if series.empty: + return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049 + elif self.N == 0: + series = pd.Series(series.iloc[0], index=series.index) + else: + series = series.shift(self.N) # copy + return series + + def get_period_offset(self, cur_index): + if self.N == 0: + return np.inf + return self.feature.get_period_offset(cur_index) + self.N + +class PMean(PRolling): + """PRolling PMean (MA) + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling average + """ + + def __init__(self, feature, N): + super(PMean, self).__init__(feature, N, "mean") + + +class PSum(PRolling): + """PRolling PSum + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling sum + """ + + def __init__(self, feature, N): + super(PSum, self).__init__(feature, N, "sum") + + +class PStd(PRolling): + """PRolling PStd + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling std + """ + + def __init__(self, feature, N): + super(PStd, self).__init__(feature, N, "std") + + +class PVar(PRolling): + """PRolling Variance + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling variance + """ + + def __init__(self, feature, N): + super(PVar, self).__init__(feature, N, "var") + + +class PSkew(PRolling): + """PRolling Skewness + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling skewness + """ + + def __init__(self, feature, N): + if N != 0 and N < 3: + raise ValueError("The rolling window size of Skewness operation should >= 3") + super(PSkew, self).__init__(feature, N, "skew") + + +class PKurt(PRolling): + """PRolling Kurtosis + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling kurtosis + """ + + def __init__(self, feature, N): + if N != 0 and N < 4: + raise ValueError("The rolling window size 
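# When 0 < N < 1, PRolling.get_period_offset above treats N as an EWM alpha and
# derives the lookback as the smallest window for which the residual weight
# (1 - N) ** window decays to roughly 1e-6.  A quick check of that formula:
import numpy as np

for alpha in (0.1, 0.5):
    window = int(np.log(1e-6) / np.log(1 - alpha))
    print(alpha, window, (1 - alpha) ** window)  # 0.1 -> 131 periods, 0.5 -> 19 periods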
of Kurtosis operation should >= 5") + super(PKurt, self).__init__(feature, N, "kurt") + + +class PMax(PRolling): + """PRolling PMax + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling max + """ + + def __init__(self, feature, N): + super(PMax, self).__init__(feature, N, "max") + + +class PIdxMax(PRolling): + """PRolling PMax Index + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling max index + """ + + def __init__(self, feature, N): + super(PIdxMax, self).__init__(feature, N, "idxmax") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) + return series + + +class PMin(PRolling): + """PRolling PMin + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling min + """ + + def __init__(self, feature, N): + super(PMin, self).__init__(feature, N, "min") + + +class PIdxMin(PRolling): + """PRolling PMin Index + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling min index + """ + + def __init__(self, feature, N): + super(PIdxMin, self).__init__(feature, N, "idxmin") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) + return series + + +class PQuantile(PRolling): + """PRolling PQuantile + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling quantile + """ + + def __init__(self, feature, N, qscore): + super(PQuantile, self).__init__(feature, N, "quantile") + self.qscore = qscore + + def __str__(self): + return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = series.expanding(min_periods=1).quantile(self.qscore) + else: + series = series.rolling(self.N, min_periods=1).quantile(self.qscore) + return series + + +class PMed(PRolling): + """PRolling Median + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling median + """ + + def __init__(self, feature, N): + super(PMed, self).__init__(feature, N, "median") + + +class PMad(PRolling): + """PRolling PMean Absolute Deviation + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling 
mean absolute deviation + """ + + def __init__(self, feature, N): + super(PMad, self).__init__(feature, N, "mad") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # TODO: implement in Cython + + def mad(x): + x1 = x[~np.isnan(x)] + return np.mean(np.abs(x1 - x1.mean())) + + if self.N == 0: + series = series.expanding(min_periods=1).apply(mad, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(mad, raw=True) + return series + + +class PRank(PRolling): + """PRolling PRank (Percentile) + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling rank + """ + + def __init__(self, feature, N): + super(PRank, self).__init__(feature, N, "rank") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # TODO: implement in Cython + + def rank(x): + if np.isnan(x[-1]): + return np.nan + x1 = x[~np.isnan(x)] + if x1.shape[0] == 0: + return np.nan + return percentileofscore(x1, x1[-1]) / len(x1) + + if self.N == 0: + series = series.expanding(min_periods=1).apply(rank, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(rank, raw=True) + return series + + +class PCount(PRolling): + """PRolling PCount + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling count of number of non-NaN elements + """ + + def __init__(self, feature, N): + super(PCount, self).__init__(feature, N, "count") + + +class PDelta(PRolling): + """PRolling PDelta + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with end minus start in rolling window + """ + + def __init__(self, feature, N): + super(PDelta, self).__init__(feature, N, "delta") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = series - series.iloc[0] + else: + series = series - series.shift(self.N) + return series + + +# TODO: +# support pair-wise rolling like `PSlope(A, B, N)` +class PSlope(PRolling): + """PRolling PSlope + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with linear regression slope of given window + """ + + def __init__(self, feature, N): + super(PSlope, self).__init__(feature, N, "slope") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = pd.Series(expanding_slope(series.values), index=series.index) + else: + series = pd.Series(rolling_slope(series.values, self.N), index=series.index) + return series + + +class PRsquare(PRolling): + """PRolling R-value Square + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with linear regression r-value square of given window + """ + + def __init__(self, feature, N): + super(PRsquare, 
self).__init__(feature, N, "rsquare") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + _series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = pd.Series(expanding_rsquare(_series.values), index=_series.index) + else: + series = pd.Series(rolling_rsquare(_series.values, self.N), index=_series.index) + series.loc[np.isclose(_series.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)] = np.nan + return series + + +class PResi(PRolling): + """PRolling Regression Residuals + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with regression residuals of given window + """ + + def __init__(self, feature, N): + super(PResi, self).__init__(feature, N, "resi") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = pd.Series(expanding_resi(series.values), index=series.index) + else: + series = pd.Series(rolling_resi(series.values, self.N), index=series.index) + return series + + +class PWMA(PRolling): + """PRolling PWMA + + Parameters + ---------- + feature : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with weighted moving average output + """ + + def __init__(self, feature, N): + super(PWMA, self).__init__(feature, N, "wma") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + # TODO: implement in Cython + + def weighted_mean(x): + w = np.arange(len(x)) + w = w / w.sum() + return np.nanmean(w * x) + + if self.N == 0: + series = series.expanding(min_periods=1).apply(weighted_mean, raw=True) + else: + series = series.rolling(self.N, min_periods=1).apply(weighted_mean, raw=True) + return series + + +class PEMA(PRolling): + """PRolling Exponential PMean (PEMA) + + Parameters + ---------- + feature : Expression + feature instance + N : int, float + rolling window size + + Returns + ---------- + Expression + a feature instance with regression r-value square of given window + """ + + def __init__(self, feature, N): + super(PEMA, self).__init__(feature, N, "ema") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + + def exp_weighted_mean(x): + a = 1 - 2 / (1 + len(x)) + w = a ** np.arange(len(x))[::-1] + w /= w.sum() + return np.nansum(w * x) + + if self.N == 0: + series = series.expanding(min_periods=1).apply(exp_weighted_mean, raw=True) + elif 0 < self.N < 1: + series = series.ewm(alpha=self.N, min_periods=1).mean() + else: + series = series.ewm(span=self.N, min_periods=1).mean() + return series + + +#################### Pair-Wise PRolling #################### +class PairRolling(PExpressionOps): + """Pair PRolling Operator + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling output of two input features + """ + + def __init__(self, feature_left, feature_right, N, func): + self.feature_left = feature_left + self.feature_right = feature_right + self.N = N + self.func = func + + 
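# The expanding branch of PEMA above builds its own exponential weights with
# a = 1 - 2 / (1 + len(x)); over a full window this should match pandas'
# adjusted EWM with span equal to the window length.  A small illustrative
# check (not part of the patch):
import numpy as np
import pandas as pd


def exp_weighted_mean(x):
    a = 1 - 2 / (1 + len(x))
    w = a ** np.arange(len(x))[::-1]
    w /= w.sum()
    return np.nansum(w * x)


x = np.arange(1.0, 11.0)
assert np.isclose(exp_weighted_mean(x), pd.Series(x).ewm(span=len(x), adjust=True).mean().iloc[-1])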
def __str__(self): + return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + if self.N == 0: + series = getattr(series_left.expanding(min_periods=1), self.func)(series_right) + else: + series = getattr(series_left.rolling(self.N, min_periods=1), self.func)(series_right) + return series + + def get_period_offset(self, cur_index): + if self.N == 0: + return np.inf + return ( + max(self.feature_left.get_period_offset(cur_index), self.feature_right.get_period_offset(cur_index)) + + self.N + - 1 + ) + + def get_extended_window_size(self): + if self.N == 0: + get_module_logger(self.__class__.__name__).warning( + "The PairRolling(ATTR, 0) will not be accurately calculated" + ) + return self.feature.get_extended_window_size() + else: + ll, lr = self.feature_left.get_extended_window_size() + rl, rr = self.feature_right.get_extended_window_size() + return max(ll, rl) + self.N - 1, max(lr, rr) + + +class PCorr(PairRolling): + """PRolling Correlation + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling correlation of two input features + """ + + def __init__(self, feature_left, feature_right, N): + super(PCorr, self).__init__(feature_left, feature_right, N, "corr") + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): + res = super(PCorr, self)._load_internal(instrument, start_index, end_index, freq) + + # NOTE: Load uses MemCache, so calling load_period_data again will not cause performance degradation + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + res.loc[ + np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) + | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) + ] = np.nan + return res + + +class PCov(PairRolling): + """PRolling Covariance + + Parameters + ---------- + feature_left : Expression + feature instance + feature_right : Expression + feature instance + N : int + rolling window size + + Returns + ---------- + Expression + a feature instance with rolling max of two input features + """ + + def __init__(self, feature_left, feature_right, N): + super(PCov, self).__init__(feature_left, feature_right, N, "cov") + + +OpsList = [ + PRef, + PMax, + PMin, + PSum, + PMean, + PStd, + PVar, + PSkew, + PKurt, + PMed, + PMad, + PSlope, + PRsquare, + PResi, + PRank, + PQuantile, + PCount, + PEMA, + PWMA, + PCorr, + PCov, + PDelta, + PAbs, + PSign, + PLog, + PPower, + PAdd, + PSub, + PMul, + PDiv, + PGreater, + PLess, + PAnd, + POr, + PNot, + PGt, + PGe, + PLt, + PLe, + PEq, + PNe, + PMask, + PIdxMax, + PIdxMin, + PIf, +] + + +def register_all_period_ops(C): + """register all operator""" + logger = get_module_logger("ops") + + from .base import Operators + + Operators.reset() + Operators.register(OpsList) + + if getattr(C, "custom_period_ops", None) is not None: + Operators.register(C.custom_ops) + logger.debug("register custom period operator {}".format(C.custom_ops)) diff --git 
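# A hedged sketch of what register_all_period_ops above presumably intends,
# assuming this module's existing imports: the guard checks C.custom_period_ops
# but then registers C.custom_ops, and the Operators.reset() call also discards
# the daily operators that register_all_ops registered just before it in
# config.register(), since both functions now share the single Operators
# wrapper in base.py.
def register_all_period_ops(C):
    logger = get_module_logger("ops")

    from .base import Operators

    # no reset() here, so the daily operators registered earlier survive
    Operators.register(OpsList)

    if getattr(C, "custom_period_ops", None) is not None:
        Operators.register(C.custom_period_ops)
        logger.debug("register custom period operator {}".format(C.custom_period_ops))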
a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 6640dae2c2..6a8fdc415b 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -162,7 +162,7 @@ def parse_field(field): # - $open+$close -> Feature("open")+Feature("close") if not isinstance(field, str): field = str(field) - return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field)) + return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"\$\$(\w+)", r'PFeature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field))) def get_module_by_module_path(module_path): From fead2438b2cc4ca7d59418e5376336f6bb88392d Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 10 Mar 2021 01:58:40 +0900 Subject: [PATCH 02/30] black format --- qlib/data/base.py | 18 ++++++++---------- qlib/data/ops_period.py | 7 ++++--- qlib/utils/__init__.py | 6 +++++- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index c3a74545e8..45c9dcd402 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -9,7 +9,6 @@ import pandas as pd - class Expression(abc.ABC): """Expression base class""" @@ -341,7 +340,6 @@ def __ror__(self, other): return POr(other, self) - @abc.abstractmethod def load_period_data(self, instrument, start_offset, end_offset, cur_index): raise NotImplementedError("This function must be implemented in your newly defined feature") @@ -359,15 +357,15 @@ def load(self, instrument, start_index, end_index, freq): return H["f"][args] if start_index is None or end_index is None or start_index > end_index: raise ValueError("Invalid index range: {} {}".format(start_index, end_index)) - - resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype='float32', name=str(self)) + + resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)) for cur_index in range(start_index, end_index + 1): start_offset, end_offset = self.get_period_offset(cur_index) resample_data[cur_index] = self.load_period_data(instrument, start_offset, end_offset, cur_index).iloc[-1] - + H["f"][args] = resample_series return resample_data - + def get_longest_back_rolling(self): return 0 @@ -376,7 +374,6 @@ def get_extended_window_size(self): class PFeature(PExpression): - def __init__(self, name=None): if name: self._name = name.lower() @@ -388,7 +385,7 @@ def __str__(self): def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code - return pd.Series([1,2,3]) + return pd.Series([1, 2, 3]) def get_period_offset(self, cur_index): return 0 @@ -402,7 +399,8 @@ class PExpressionOps(PExpression): """ pass - + + class OpsWrapper: """Ops Wrapper""" @@ -429,4 +427,4 @@ def __getattr__(self, key): return self._ops[key] -Operators = OpsWrapper() \ No newline at end of file +Operators = OpsWrapper() diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py index 7409c2ac15..550deafabc 100644 --- a/qlib/data/ops_period.py +++ b/qlib/data/ops_period.py @@ -31,7 +31,6 @@ class PElemOperator(PExpressionOps): - def __init__(self, feature): self.feature = feature @@ -66,6 +65,7 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) return getattr(np, self.func)(series) + class PAbs(PNpElemOperator): """Feature Absolute Value @@ -179,7 +179,7 @@ def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower()) def load_period_data(self, instrument, start_offset, 
end_offset, cur_index): - + return self.feature.load_period_data(self.instrument, start_offset, end_offset, cur_index) @@ -241,6 +241,7 @@ def get_period_offset(self, cur_index): right_br = 0 return max(left_br, right_br) + class PNpPairOperator(PPairOperator): """Numpy Pair-wise operator @@ -613,7 +614,6 @@ def get_period_offset(self, cur_index): return max(left_br, right_br, c_br) - #################### PRolling #################### # NOTE: methods like `rolling.mean` are optimized with cython, # and are super faster than `rolling.apply(np.mean)` @@ -703,6 +703,7 @@ def get_period_offset(self, cur_index): return np.inf return self.feature.get_period_offset(cur_index) + self.N + class PMean(PRolling): """PRolling PMean (MA) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 6a8fdc415b..38cbdcce8a 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -162,7 +162,11 @@ def parse_field(field): # - $open+$close -> Feature("open")+Feature("close") if not isinstance(field, str): field = str(field) - return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"\$\$(\w+)", r'PFeature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field))) + return re.sub( + r"\$(\w+)", + r'Feature("\1")', + re.sub(r"\$\$(\w+)", r'PFeature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field)), + ) def get_module_by_module_path(module_path): From 61720c29d43efc8a8ef8ce902f12112bba593744 Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 10 Mar 2021 16:15:00 +0900 Subject: [PATCH 03/30] add pit data read --- qlib/data/base.py | 13 +- qlib/data/data.py | 32 ++ qlib/data/ops_period.py | 762 ---------------------------------------- qlib/utils/__init__.py | 70 ++++ 4 files changed, 110 insertions(+), 767 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index 45c9dcd402..2e72367b88 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -360,8 +360,8 @@ def load(self, instrument, start_index, end_index, freq): resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)) for cur_index in range(start_index, end_index + 1): - start_offset, end_offset = self.get_period_offset(cur_index) - resample_data[cur_index] = self.load_period_data(instrument, start_offset, end_offset, cur_index).iloc[-1] + start_offset = self.get_period_offset(cur_index) + resample_data[cur_index] = self.load_period_data(instrument, start_offset, 0, cur_index).iloc[-1] H["f"][args] = resample_series return resample_data @@ -381,14 +381,17 @@ def __init__(self, name=None): self._name = type(self).__name__.lower() def __str__(self): - return "$" + self._name + return "$$" + self._name def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code - return pd.Series([1, 2, 3]) + from .data import FeatureD + + return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index) + # return pd.Series([1, 2, 3]) # fot test def get_period_offset(self, cur_index): - return 0 + return 0, 0 class PExpressionOps(PExpression): diff --git a/qlib/data/data.py b/qlib/data/data.py index bce173eeaa..ba66b75253 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -632,6 +632,14 @@ def _uri_data(self): """Static feature file uri.""" return os.path.join(C.get_data_path(), "features", "{}", "{}.{}.bin") + @property + def _uri_period_index(self): + return os.path.join(C.get_data_path(), "financial", "{}", "{}.index") + + @property + def _uri_period_data(self): + return os.path.join(C.get_data_path(), "financial", "{}", "{}.data") + def 
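# The reworked parse_field above rewrites operator calls first, then `$$name`
# point-in-time fields to PFeature, then the remaining `$name` daily fields to
# Feature.  The field names below are hypothetical examples used only to show
# the substitution order:
import re


def parse_field(field):
    if not isinstance(field, str):
        field = str(field)
    return re.sub(
        r"\$(\w+)",
        r'Feature("\1")',
        re.sub(r"\$\$(\w+)", r'PFeature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field)),
    )


print(parse_field("Mean($close, 5)"))    # Operators.Mean(Feature("close"), 5)
print(parse_field("PMean($$roe_q, 4)"))  # Operators.PMean(PFeature("roe_q"), 4)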
feature(self, instrument, field, start_index, end_index, freq): # validate field = str(field).lower()[1:] @@ -645,6 +653,30 @@ def feature(self, instrument, field, start_index, end_index, freq): series = read_bin(uri_data, start_index, end_index) return series + def period_feature(self, instrument, field, start_offset, end_offset, cur_index): + DATA_RECORDS = [("date", "I"), ("period", "I"), ("value", "d"), ("_next", "I")] + + NA_VALUE = float("NAN") + + field = str(field).lower()[2:] + instrument = code_to_fname(instrument) + if not field.startswith("q_") and not field.startswith("a_"): + raise ValueError("period field must start with 'q_' or 'a_'") + quarterly = field.startswith("q_") + index_path = sself._uri_period_index.format(instrument.lower(), field) + data_path = self._uri_period_data.format(instrument.lower(), field) + + data = np.fromfile(data_file, dtype=DATA_RECORDS) + # find all revision periods before `cur_date` + loc = np.searchsorted(data["date"], cur_date, side="left") + if loc <= 0: + return NA_VALUE + last_period = data["period"][loc - start_offset : loc - end_offset].max() # return the latest quarter + first_period = data["period"][loc - start_offset : loc - end_offset].min() + + series = read_period_interval_data(index_path, data_path, last_period, first_period, cur_index, quarterly) + return series + class LocalExpressionProvider(ExpressionProvider): """Local expression data provider class diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py index 550deafabc..c33076893b 100644 --- a/qlib/data/ops_period.py +++ b/qlib/data/ops_period.py @@ -42,21 +42,6 @@ def get_period_offset(self, cur_index): class PNpElemOperator(PElemOperator): - """Numpy Element-wise Operator - - Parameters - ---------- - feature : Expression - feature instance - func : str - numpy feature operation method - - Returns - ---------- - Expression - feature operation output - """ - def __init__(self, feature, func): self.func = func super(PNpElemOperator, self).__init__(feature) @@ -67,37 +52,11 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PAbs(PNpElemOperator): - """Feature Absolute Value - - Parameters - ---------- - feature : Expression - feature instance - - Returns - ---------- - Expression - a feature instance with absolute output - """ - def __init__(self, feature): super(PAbs, self).__init__(feature, "abs") class PSign(PNpElemOperator): - """Feature PSign - - Parameters - ---------- - feature : Expression - feature instance - - Returns - ---------- - Expression - a feature instance with sign - """ - def __init__(self, feature): super(PSign, self).__init__(feature, "sign") @@ -112,37 +71,11 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PLog(PNpElemOperator): - """Feature PLog - - Parameters - ---------- - feature : Expression - feature instance - - Returns - ---------- - Expression - a feature instance with log - """ - def __init__(self, feature): super(PLog, self).__init__(feature, "log") class PPower(PNpElemOperator): - """Feature PPower - - Parameters - ---------- - feature : Expression - feature instance - - Returns - ---------- - Expression - a feature instance with power - """ - def __init__(self, feature, exponent): super(PPower, self).__init__(feature, "power") self.exponent = exponent @@ -156,21 +89,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PMask(PNpElemOperator): - """Feature PMask - - Parameters - ---------- - feature : Expression - feature 
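# A speculative sketch of the point-in-time lookup that period_feature above
# appears to implement: the method as committed still references `sself`,
# `data_file`, `cur_date` and a `read_period_interval_data` helper that this
# patch does not define, so the exact semantics are only implied.  `data_path`
# and `cur_date` below are assumed stand-ins for the resolved .data file and
# the calendar date of `cur_index`, and records are assumed sorted by
# announcement date as the dtype suggests.
import numpy as np

DATA_RECORDS = [("date", "I"), ("period", "I"), ("value", "d"), ("_next", "I")]


def latest_period_value(data_path, cur_date):
    data = np.fromfile(data_path, dtype=DATA_RECORDS)
    # keep only the revisions announced before `cur_date`
    loc = np.searchsorted(data["date"], cur_date, side="left")
    if loc <= 0:
        return float("nan")
    visible = data[:loc]
    # take the most recently reported fiscal period, then its latest revision
    latest = visible[visible["period"] == visible["period"].max()]
    return latest["value"][-1]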
instance - instrument : str - instrument mask - - Returns - ---------- - Expression - a feature instance with masked instrument - """ - def __init__(self, feature, instrument): super(PMask, self).__init__(feature, "mask") self.instrument = instrument @@ -184,44 +102,12 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PNot(PNpElemOperator): - """PNot Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - feature elementwise not output - """ - def __init__(self, feature): super(PNot, self).__init__(feature, "bitwise_not") #################### Pair-Wise Operator #################### class PPairOperator(PExpressionOps): - """Pair-wise operator - - Parameters - ---------- - feature_left : Expression - feature instance or numeric value - feature_right : Expression - feature instance or numeric value - func : str - operator function - - Returns - ---------- - Feature: - two features' operation output - """ - def __init__(self, feature_left, feature_right): self.feature_left = feature_left self.feature_right = feature_right @@ -243,23 +129,6 @@ def get_period_offset(self, cur_index): class PNpPairOperator(PPairOperator): - """Numpy Pair-wise operator - - Parameters - ---------- - feature_left : Expression - feature instance or numeric value - feature_right : Expression - feature instance or numeric value - func : str - operator function - - Returns - ---------- - Feature: - two features' operation output - """ - def __init__(self, feature_left, feature_right, func): self.feature_left = feature_left self.feature_right = feature_right @@ -282,299 +151,77 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PAdd(PNpPairOperator): - """PAdd Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' sum - """ - def __init__(self, feature_left, feature_right): super(PAdd, self).__init__(feature_left, feature_right, "add") class PSub(PNpPairOperator): - """Subtract Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' subtraction - """ - def __init__(self, feature_left, feature_right): super(PSub, self).__init__(feature_left, feature_right, "subtract") class PMul(PNpPairOperator): - """Multiply Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' product - """ - def __init__(self, feature_left, feature_right): super(PMul, self).__init__(feature_left, feature_right, "multiply") class PDiv(PNpPairOperator): - """Division Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' division - """ - def __init__(self, feature_left, feature_right): super(PDiv, self).__init__(feature_left, feature_right, "divide") class PGreater(PNpPairOperator): - """PGreater Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - greater elements taken from the input two features - """ - def __init__(self, feature_left, 
feature_right): super(PGreater, self).__init__(feature_left, feature_right, "maximum") class PLess(PNpPairOperator): - """PLess Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - smaller elements taken from the input two features - """ - def __init__(self, feature_left, feature_right): super(PLess, self).__init__(feature_left, feature_right, "minimum") class PGt(PNpPairOperator): - """PGreater Than Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left > right` - """ - def __init__(self, feature_left, feature_right): super(PGt, self).__init__(feature_left, feature_right, "greater") class PGe(PNpPairOperator): - """PGreater Equal Than Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left >= right` - """ - def __init__(self, feature_left, feature_right): super(PGe, self).__init__(feature_left, feature_right, "greater_equal") class PLt(PNpPairOperator): - """PLess Than Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left < right` - """ - def __init__(self, feature_left, feature_right): super(PLt, self).__init__(feature_left, feature_right, "less") class PLe(PNpPairOperator): - """PLess Equal Than Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left <= right` - """ - def __init__(self, feature_left, feature_right): super(PLe, self).__init__(feature_left, feature_right, "less_equal") class PEq(PNpPairOperator): - """Equal Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left == right` - """ - def __init__(self, feature_left, feature_right): super(PEq, self).__init__(feature_left, feature_right, "equal") class PNe(PNpPairOperator): - """PNot Equal Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - bool series indicate `left != right` - """ - def __init__(self, feature_left, feature_right): super(PNe, self).__init__(feature_left, feature_right, "not_equal") class PAnd(PNpPairOperator): - """PAnd Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' row by row & output - """ - def __init__(self, feature_left, feature_right): super(PAnd, self).__init__(feature_left, feature_right, "bitwise_and") class POr(PNpPairOperator): - """POr Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - - Returns - ---------- - Feature: - two features' row by row | outputs - """ - def __init__(self, feature_left, feature_right): super(POr, self).__init__(feature_left, feature_right, "bitwise_or") #################### Triple-wise Operator #################### class 
PIf(PExpressionOps): - """PIf Operator - - Parameters - ---------- - condition : Expression - feature instance with bool values as condition - feature_left : Expression - feature instance - feature_right : Expression - feature instance - """ - def __init__(self, condition, feature_left, feature_right): self.condition = condition self.feature_left = feature_left @@ -620,23 +267,6 @@ def get_period_offset(self, cur_index): class PRolling(PExpressionOps): - """PRolling Operator - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - func : str - rolling method - - Returns - ---------- - Expression - rolling outputs - """ - def __init__(self, feature, N, func): self.feature = feature self.N = N @@ -669,21 +299,6 @@ def get_period_offset(self, cur_index): class PRef(PRolling): - """Feature Reference - - Parameters - ---------- - feature : Expression - feature instance - N : int - N = 0, retrieve the first data; N > 0, retrieve data of N periods ago; N < 0, future data - - Returns - ---------- - Expression - a feature instance with target reference - """ - def __init__(self, feature, N): super(PRef, self).__init__(feature, N, "ref") @@ -705,101 +320,26 @@ def get_period_offset(self, cur_index): class PMean(PRolling): - """PRolling PMean (MA) - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling average - """ - def __init__(self, feature, N): super(PMean, self).__init__(feature, N, "mean") class PSum(PRolling): - """PRolling PSum - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling sum - """ - def __init__(self, feature, N): super(PSum, self).__init__(feature, N, "sum") class PStd(PRolling): - """PRolling PStd - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling std - """ - def __init__(self, feature, N): super(PStd, self).__init__(feature, N, "std") class PVar(PRolling): - """PRolling Variance - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling variance - """ - def __init__(self, feature, N): super(PVar, self).__init__(feature, N, "var") class PSkew(PRolling): - """PRolling Skewness - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling skewness - """ - def __init__(self, feature, N): if N != 0 and N < 3: raise ValueError("The rolling window size of Skewness operation should >= 3") @@ -807,21 +347,6 @@ def __init__(self, feature, N): class PKurt(PRolling): - """PRolling Kurtosis - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling kurtosis - """ - def __init__(self, feature, N): if N != 0 and N < 4: raise ValueError("The rolling window size of Kurtosis operation should >= 5") @@ -829,41 +354,11 @@ def __init__(self, feature, N): class PMax(PRolling): - """PRolling PMax - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with 
rolling max - """ - def __init__(self, feature, N): super(PMax, self).__init__(feature, N, "max") class PIdxMax(PRolling): - """PRolling PMax Index - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling max index - """ - def __init__(self, feature, N): super(PIdxMax, self).__init__(feature, N, "idxmax") @@ -877,41 +372,11 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PMin(PRolling): - """PRolling PMin - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling min - """ - def __init__(self, feature, N): super(PMin, self).__init__(feature, N, "min") class PIdxMin(PRolling): - """PRolling PMin Index - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling min index - """ - def __init__(self, feature, N): super(PIdxMin, self).__init__(feature, N, "idxmin") @@ -925,21 +390,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PQuantile(PRolling): - """PRolling PQuantile - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling quantile - """ - def __init__(self, feature, N, qscore): super(PQuantile, self).__init__(feature, N, "quantile") self.qscore = qscore @@ -957,41 +407,11 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PMed(PRolling): - """PRolling Median - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling median - """ - def __init__(self, feature, N): super(PMed, self).__init__(feature, N, "median") class PMad(PRolling): - """PRolling PMean Absolute Deviation - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling mean absolute deviation - """ - def __init__(self, feature, N): super(PMad, self).__init__(feature, N, "mad") @@ -1011,21 +431,6 @@ def mad(x): class PRank(PRolling): - """PRolling PRank (Percentile) - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling rank - """ - def __init__(self, feature, N): super(PRank, self).__init__(feature, N, "rank") @@ -1049,41 +454,11 @@ def rank(x): class PCount(PRolling): - """PRolling PCount - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling count of number of non-NaN elements - """ - def __init__(self, feature, N): super(PCount, self).__init__(feature, N, "count") class PDelta(PRolling): - """PRolling PDelta - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with end minus start in rolling window - """ - def __init__(self, feature, N): super(PDelta, self).__init__(feature, N, "delta") @@ -1099,21 +474,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): # TODO: # 
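The PRolling bodies appear here only as diff context, so their implementation is not visible in this hunk. As a sketch of the intended semantics (an assumption based on the operator names and on the later PSum($$q_taxrate, 4) test), a rolling operator reduces the trailing N reported periods visible at the evaluation date, and PExpression.load then keeps the last value:

import pandas as pd

# quarterly values visible at some trade date, indexed by period code (YYYY0Q); figures are synthetic
visible = pd.Series(
    [0.21, 0.22, 0.20, 0.23, 0.19],
    index=[201901, 201902, 201903, 201904, 202001],
)

N = 4
print(visible.rolling(N).sum().iloc[-1])    # PSum-like: 0.22 + 0.20 + 0.23 + 0.19
print(visible.rolling(N).mean().iloc[-1])   # PMean-like trailing average
print(visible.iloc[-1] - visible.iloc[-N])  # PDelta-like: end minus start of the window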
support pair-wise rolling like `PSlope(A, B, N)` class PSlope(PRolling): - """PRolling PSlope - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with linear regression slope of given window - """ - def __init__(self, feature, N): super(PSlope, self).__init__(feature, N, "slope") @@ -1127,21 +487,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PRsquare(PRolling): - """PRolling R-value Square - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with linear regression r-value square of given window - """ - def __init__(self, feature, N): super(PRsquare, self).__init__(feature, N, "rsquare") @@ -1156,21 +501,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PResi(PRolling): - """PRolling Regression Residuals - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with regression residuals of given window - """ - def __init__(self, feature, N): super(PResi, self).__init__(feature, N, "resi") @@ -1184,21 +514,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PWMA(PRolling): - """PRolling PWMA - - Parameters - ---------- - feature : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with weighted moving average output - """ - def __init__(self, feature, N): super(PWMA, self).__init__(feature, N, "wma") @@ -1219,21 +534,6 @@ def weighted_mean(x): class PEMA(PRolling): - """PRolling Exponential PMean (PEMA) - - Parameters - ---------- - feature : Expression - feature instance - N : int, float - rolling window size - - Returns - ---------- - Expression - a feature instance with regression r-value square of given window - """ - def __init__(self, feature, N): super(PEMA, self).__init__(feature, N, "ema") @@ -1257,23 +557,6 @@ def exp_weighted_mean(x): #################### Pair-Wise PRolling #################### class PairRolling(PExpressionOps): - """Pair PRolling Operator - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling output of two input features - """ - def __init__(self, feature_left, feature_right, N, func): self.feature_left = feature_left self.feature_right = feature_right @@ -1301,36 +584,8 @@ def get_period_offset(self, cur_index): - 1 ) - def get_extended_window_size(self): - if self.N == 0: - get_module_logger(self.__class__.__name__).warning( - "The PairRolling(ATTR, 0) will not be accurately calculated" - ) - return self.feature.get_extended_window_size() - else: - ll, lr = self.feature_left.get_extended_window_size() - rl, rr = self.feature_right.get_extended_window_size() - return max(ll, rl) + self.N - 1, max(lr, rr) - class PCorr(PairRolling): - """PRolling Correlation - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling correlation of two input features - """ - def __init__(self, feature_left, feature_right, N): super(PCorr, 
self).__init__(feature_left, feature_right, N, "corr") @@ -1348,23 +603,6 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): class PCov(PairRolling): - """PRolling Covariance - - Parameters - ---------- - feature_left : Expression - feature instance - feature_right : Expression - feature instance - N : int - rolling window size - - Returns - ---------- - Expression - a feature instance with rolling max of two input features - """ - def __init__(self, feature_left, feature_right, N): super(PCov, self).__init__(feature_left, feature_right, N, "cov") diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 38cbdcce8a..bebfd1d703 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -13,6 +13,7 @@ import redis import bisect import shutil +import struct import difflib import hashlib import datetime @@ -55,6 +56,75 @@ def read_bin(file_path, start_index, end_index): return series +def read_period_interval_data(index_path, data_path, last_period, first_period, cur_index, quarterly): + + INDEX_DTYPE = "I" # unsigned int32 + DATE_DTYPE = "I" + PERIOD_DTYPE = "I" + VALUE_DTYPE = "d" # float64 + + INDEX_DTYPE_SIZE = struct.calcsize(INDEX_DTYPE) + DATE_DTYPE_SIZE = struct.calcsize(DATE_DTYPE) + PERIOD_DTYPE_SIZE = struct.calcsize(PERIOD_DTYPE) + VALUE_DTYPE_SIZE = struct.calcsize(VALUE_DTYPE) + + DATA_RECORDS = [("date", DATE_DTYPE), ("period", PERIOD_DTYPE), ("value", VALUE_DTYPE), ("_next", INDEX_DTYPE)] + + DATA_DTYPE = "".join([v for k, v in DATA_RECORDS]) + DATA_DTYPE_SIZE = struct.calcsize(DATA_DTYPE) + + NA_DATE = 0 + NA_PERIOD = 0 + NA_INDEX = 0xFFFFFFFF + NA_VALUE = float("NAN") + + def get_period_list(first, last, quarterly): + if not quarterly: + assert all(1900 <= x <= 2099 for x in (first, last)), "invalid arguments" + return list(range(first, last + 1)) + assert all(190000 <= x <= 209904 for x in (first, last)), "invalid arguments" + res = [] + for year in range(first // 100, last // 100 + 1): + for q in range(1, 5): + period = year * 100 + q + if first <= period <= last: + res.append(year * 100 + q) + return res + + period_list = get_period_list(first_period, last_period, quarterly) + value = np.empty(len(period_list), dtype=VALUE_DTYPE) + + def read_period_data(index_path, data_path, period, cur_date, quarterly): + def get_period_offset(first_year, period, quarterly): + if quarterly: + offset = (period // 100 - first_year) * 4 + period % 100 - 1 + else: + offset = period - first_year + return offset + + with open(index_path, "rb") as fi: + (first_year,) = struct.unpack(PERIOD_DTYPE, fi.read(PERIOD_DTYPE_SIZE)) + all_periods = np.fromfile(fi, dtype=INDEX_DTYPE) + # find the first index of linked revisions + offset = get_period_offset(first_year, period, quarterly) + _next = all_periods[offset] + # load data following the `_next` link + prev_value = NA_VALUE + with open(data_path, "rb") as fd: + while _next != NA_INDEX: + fd.seek(_next) + date, period, value, _next = struct.unpack(DATA_DTYPE, fd.read(DATA_DTYPE_SIZE)) + if date >= cur_date: # NOTE: only use after published date + break + prev_value = value + return prev_value + + for i, period in enumerate(period_list): + value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) + series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) + return series + + def np_ffill(arr: np.array): """ forward fill a 1D numpy array From a0959a9623232cc4203881c0cba65266318da3ed Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 10 Mar 2021 23:08:14 +0900 Subject: [PATCH 
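The helpers above fix the period encoding: annual periods are plain years (1900-2099), quarterly periods are YYYY0Q integers such as 201904, and the index file is addressed by a dense offset counted from the first stored year. The same arithmetic, restated as a standalone snippet:

def get_period_list(first, last, quarterly):
    # annual: consecutive years; quarterly: YYYY0Q codes such as 201904
    if not quarterly:
        return list(range(first, last + 1))
    res = []
    for year in range(first // 100, last // 100 + 1):
        for q in range(1, 5):
            period = year * 100 + q
            if first <= period <= last:
                res.append(period)
    return res

def get_period_offset(first_year, period, quarterly):
    # position of `period` in the index file, counted from `first_year`
    if quarterly:
        return (period // 100 - first_year) * 4 + period % 100 - 1
    return period - first_year

print(get_period_list(201903, 202002, quarterly=True))  # [201903, 201904, 202001, 202002]
print(get_period_offset(2015, 201902, quarterly=True))  # (2019 - 2015) * 4 + 2 - 1 = 17
print(get_period_offset(2015, 2019, quarterly=False))   # 4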
04/30] fix bug in period ops --- qlib/data/base.py | 52 ++++++++++++++++++++--------------------- qlib/data/ops.py | 35 +++++++++++++-------------- qlib/data/ops_period.py | 20 ++++++++-------- 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index 2e72367b88..94c8b55e94 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -236,107 +236,107 @@ def __repr__(self): return str(self) def __gt__(self, other): - from .ops import PGt + from .ops_period import PGt return PGt(self, other) def __ge__(self, other): - from .ops import PGe + from .ops_period import PGe return PGe(self, other) def __lt__(self, other): - from .ops import PLt + from .ops_period import PLt return PLt(self, other) def __le__(self, other): - from .ops import PLe + from .ops_period import PLe return PLe(self, other) def __eq__(self, other): - from .ops import PEq + from .ops_period import PEq return PEq(self, other) def __ne__(self, other): - from .ops import PNe + from .ops_period import PNe return PNe(self, other) def __add__(self, other): - from .ops import PAdd + from .ops_period import PAdd return PAdd(self, other) def __radd__(self, other): - from .ops import PAdd + from .ops_period import PAdd return PAdd(other, self) def __sub__(self, other): - from .ops import PSub + from .ops_period import PSub return PSub(self, other) def __rsub__(self, other): - from .ops import PSub + from .ops_period import PSub return PSub(other, self) def __mul__(self, other): - from .ops import PMul + from .ops_period import PMul return PMul(self, other) def __rmul__(self, other): - from .ops import PMul + from .ops_period import PMul return PMul(self, other) def __div__(self, other): - from .ops import PDiv + from .ops_period import PDiv return PDiv(self, other) def __rdiv__(self, other): - from .ops import PDiv + from .ops_period import PDiv return PDiv(other, self) def __truediv__(self, other): - from .ops import PDiv + from .ops_period import PDiv return PDiv(self, other) def __rtruediv__(self, other): - from .ops import PDiv + from .ops_period import PDiv return PDiv(other, self) def __pow__(self, other): - from .ops import PPower + from .ops_period import PPower return PPower(self, other) def __and__(self, other): - from .ops import PAnd + from .ops_period import PAnd return PAnd(self, other) def __rand__(self, other): - from .ops import PAnd + from .ops_period import PAnd return PAnd(other, self) def __or__(self, other): - from .ops import POr + from .ops_period import POr return POr(self, other) def __ror__(self, other): - from .ops import POr + from .ops_period import POr return POr(other, self) @@ -361,10 +361,10 @@ def load(self, instrument, start_index, end_index, freq): resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)) for cur_index in range(start_index, end_index + 1): start_offset = self.get_period_offset(cur_index) - resample_data[cur_index] = self.load_period_data(instrument, start_offset, 0, cur_index).iloc[-1] + resample_series[cur_index] = self.load_period_data(instrument, start_offset, 0, cur_index).iloc[-1] H["f"][args] = resample_series - return resample_data + return resample_series def get_longest_back_rolling(self): return 0 @@ -385,10 +385,10 @@ def __str__(self): def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code - from .data import FeatureD + # from .data import FeatureD - return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, 
cur_index) - # return pd.Series([1, 2, 3]) # fot test + # return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index) + return pd.Series([1, 2, 3]) # fot test def get_period_offset(self, cur_index): return 0, 0 diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 0e698cd421..d32bfc692f 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -12,7 +12,7 @@ from scipy.stats import percentileofscore -from .base import Expression, ExpressionOps +from .base import Expression, PExpression, ExpressionOps from ..log import get_module_logger try: @@ -245,24 +245,24 @@ def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right) def get_longest_back_rolling(self): - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): left_br = self.feature_left.get_longest_back_rolling() else: left_br = 0 - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, (Expression, PExpression)): right_br = self.feature_right.get_longest_back_rolling() else: right_br = 0 return max(left_br, right_br) def get_extended_window_size(self): - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): ll, lr = self.feature_left.get_extended_window_size() else: ll, lr = 0, 0 - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, (Expression, PExpression)): rl, rr = self.feature_right.get_extended_window_size() else: rl, rr = 0, 0 @@ -292,14 +292,15 @@ def __init__(self, feature_left, feature_right, func): super(NpPairOperator, self).__init__(feature_left, feature_right) def _load_internal(self, instrument, start_index, end_index, freq): + print((self.feature_left, self.feature_right)) assert any( - [isinstance(self.feature_left, Expression), self.feature_right, Expression] + [isinstance(self.feature_left, (Expression, PExpression)), self.feature_right, Expression] ), "at least one of two inputs is Expression instance" - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): series_left = self.feature_left.load(instrument, start_index, end_index, freq) else: series_left = self.feature_left # numeric value - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, (Expression, PExpression)): series_right = self.feature_right.load(instrument, start_index, end_index, freq) else: series_right = self.feature_right @@ -610,11 +611,11 @@ def __str__(self): def _load_internal(self, instrument, start_index, end_index, freq): series_cond = self.condition.load(instrument, start_index, end_index, freq) - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): series_left = self.feature_left.load(instrument, start_index, end_index, freq) else: series_left = self.feature_left - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, (Expression, PExpression)): series_right = self.feature_right.load(instrument, start_index, end_index, freq) else: series_right = self.feature_right @@ -622,34 +623,34 @@ def _load_internal(self, instrument, start_index, end_index, freq): return series def get_longest_back_rolling(self): - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): left_br = self.feature_left.get_longest_back_rolling() else: left_br = 0 - if isinstance(self.feature_right, Expression): + if 
isinstance(self.feature_right, (Expression, PExpression)): right_br = self.feature_right.get_longest_back_rolling() else: right_br = 0 - if isinstance(self.condition, Expression): + if isinstance(self.condition, (Expression, PExpression)): c_br = self.condition.get_longest_back_rolling() else: c_br = 0 return max(left_br, right_br, c_br) def get_extended_window_size(self): - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, (Expression, PExpression)): ll, lr = self.feature_left.get_extended_window_size() else: ll, lr = 0, 0 - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, (Expression, PExpression)): rl, rr = self.feature_right.get_extended_window_size() else: rl, rr = 0, 0 - if isinstance(self.condition, Expression): + if isinstance(self.condition, (Expression, PExpression)): cl, cr = self.condition.get_extended_window_size() else: cl, cr = 0, 0 @@ -1492,7 +1493,7 @@ def register_all_ops(C): from .base import Operators - Operators.reset() + # Operators.reset() Operators.register(OpsList) if getattr(C, "custom_ops", None) is not None: diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py index c33076893b..c94a24482c 100644 --- a/qlib/data/ops_period.py +++ b/qlib/data/ops_period.py @@ -137,13 +137,13 @@ def __init__(self, feature_left, feature_right, func): def load_period_data(self, instrument, start_offset, end_offset, cur_index): assert any( - [isinstance(self.feature_left, Expression), self.feature_right, Expression] - ), "at least one of two inputs is Expression instance" - if isinstance(self.feature_left, Expression): + [isinstance(self.feature_left, PExpression), self.feature_right, PExpression] + ), "at least one of two inputs is PExpression instance" + if isinstance(self.feature_left, PExpression): series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) else: series_left = self.feature_left # numeric value - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, PExpression): series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) else: series_right = self.feature_right @@ -232,11 +232,11 @@ def __str__(self): def load_period_data(self, instrument, start_offset, end_offset, cur_index): series_cond = self.condition.load_period_data(instrument, start_offset, end_offset, cur_index) - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, PExpression): series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) else: series_left = self.feature_left - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, PExpression): series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) else: series_right = self.feature_right @@ -244,17 +244,17 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): return series def get_period_offset(self, cur_index): - if isinstance(self.feature_left, Expression): + if isinstance(self.feature_left, PExpression): left_br = self.feature_left.get_period_offset(cur_index) else: left_br = 0 - if isinstance(self.feature_right, Expression): + if isinstance(self.feature_right, PExpression): right_br = self.feature_right.get_period_offset(cur_index) else: right_br = 0 - if isinstance(self.condition, Expression): + if isinstance(self.condition, PExpression): c_br = self.condition.get_period_offset(cur_index) else: c_br = 0 @@ 
-662,7 +662,7 @@ def register_all_period_ops(C): from .base import Operators - Operators.reset() + # Operators.reset() Operators.register(OpsList) if getattr(C, "custom_period_ops", None) is not None: From bd46d1466b3d493bd1a41e70f60c5bd54a606ee2 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 11 Mar 2021 01:34:55 +0900 Subject: [PATCH 05/30] update ops runnable --- qlib/config.py | 5 +- qlib/data/base.py | 231 +++++++++++++++++++++++++++++++--------- qlib/data/data.py | 17 ++- qlib/data/ops.py | 1 - qlib/data/ops_period.py | 16 +-- qlib/utils/__init__.py | 3 +- tests/test_PIT.py | 8 ++ 7 files changed, 203 insertions(+), 78 deletions(-) create mode 100644 tests/test_PIT.py diff --git a/qlib/config.py b/qlib/config.py index d1f8897898..1ab4bec91d 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -189,7 +189,6 @@ def set_conf_from_C(self, config_c): "region": REG_CN, ## Custom Operator "custom_ops": [], - "custom_period_ops": [], }, } @@ -297,14 +296,12 @@ def set(self, default_conf="client", **kwargs): def register(self): from .utils import init_instance_by_config - from .data.ops import register_all_ops - from .data.ops_period import register_all_period_ops + from .data.base import register_all_ops from .data.data import register_all_wrappers from .workflow import R, QlibRecorder from .workflow.utils import experiment_exit_handler register_all_ops(self) - register_all_period_ops(self) register_all_wrappers(self) # set up QlibRecorder exp_manager = init_instance_by_config(self["exp_manager"]) diff --git a/qlib/data/base.py b/qlib/data/base.py index 94c8b55e94..19eff1b895 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -7,6 +7,9 @@ import abc import pandas as pd +import numpy as np + +from ..log import get_module_logger class Expression(abc.ABC): @@ -236,109 +239,214 @@ def __repr__(self): return str(self) def __gt__(self, other): - from .ops_period import PGt + if isinstance(other, Expression): + from .ops import Gt + + return Gt(self, other) + else: + from .ops_period import PGt - return PGt(self, other) + return PGt(self, other) def __ge__(self, other): - from .ops_period import PGe + if isinstance(other, Expression): + from .ops import Ge - return PGe(self, other) + return Ge(self, other) + else: + from .ops_period import PGe + + return PGe(self, other) def __lt__(self, other): - from .ops_period import PLt + if isinstance(other, Expression): + from .ops import Lt + + return Lt(self, other) + else: + from .ops_period import PLt - return PLt(self, other) + return PLt(self, other) def __le__(self, other): - from .ops_period import PLe + if isinstance(other, Expression): + from .ops import Le - return PLe(self, other) + return Le(self, other) + else: + from .ops_period import PLe + + return PLe(self, other) def __eq__(self, other): - from .ops_period import PEq + if isinstance(other, Expression): + from .ops import Eq + + return Eq(self, other) + else: + from .ops_period import PEq - return PEq(self, other) + return PEq(self, other) def __ne__(self, other): - from .ops_period import PNe + if isinstance(other, Expression): + from .ops import Ne - return PNe(self, other) + return Ne(self, other) + else: + from .ops_period import PNe + + return PNe(self, other) def __add__(self, other): - from .ops_period import PAdd + if isinstance(other, Expression): + from .ops import Add - return PAdd(self, other) + return Add(self, other) + else: + from .ops_period import PAdd + + return PAdd(self, other) def __radd__(self, other): - from .ops_period import PAdd + if isinstance(other, 
Expression): + from .ops import Add + + return Add(other, self) + else: + from .ops_period import PAdd - return PAdd(other, self) + return PAdd(other, self) def __sub__(self, other): - from .ops_period import PSub + if isinstance(other, Expression): + from .ops import Sub - return PSub(self, other) + return Sub(self, other) + else: + from .ops_period import PSub + + return PSub(self, other) def __rsub__(self, other): - from .ops_period import PSub + if isinstance(other, Expression): + from .ops import Sub + + return Sub(other, self) + else: + from .ops_period import PSub - return PSub(other, self) + return PSub(other, self) def __mul__(self, other): - from .ops_period import PMul + if isinstance(other, Expression): + from .ops import Mul - return PMul(self, other) + return Mul(self, other) + else: + from .ops_period import PMul + + return PMul(self, other) def __rmul__(self, other): - from .ops_period import PMul + if isinstance(other, Expression): + from .ops import Mul + + return Mul(other, self) + else: + from .ops_period import PMul - return PMul(self, other) + return PMul(other, self) def __div__(self, other): - from .ops_period import PDiv + if isinstance(other, Expression): + from .ops import Div + + return Div(self, other) + else: + from .ops_period import PDiv - return PDiv(self, other) + return PDiv(self, other) def __rdiv__(self, other): - from .ops_period import PDiv + if isinstance(other, Expression): + from .ops import Div - return PDiv(other, self) + return Div(other, self) + else: + from .ops_period import PDiv + + return PDiv(other, self) def __truediv__(self, other): - from .ops_period import PDiv + if isinstance(other, Expression): + from .ops import Div + + return Div(self, other) + else: + from .ops_period import PDiv - return PDiv(self, other) + return PDiv(self, other) def __rtruediv__(self, other): - from .ops_period import PDiv + if isinstance(other, Expression): + from .ops import Div - return PDiv(other, self) + return Div(other, self) + else: + from .ops_period import PDiv + + return PDiv(other, self) def __pow__(self, other): - from .ops_period import PPower + if isinstance(other, Expression): + from .ops import Power + + return Power(self, other) + else: + from .ops_period import PPower - return PPower(self, other) + return PPower(self, other) def __and__(self, other): - from .ops_period import PAnd + if isinstance(other, Expression): + from .ops import And + + return And(self, other) + else: + from .ops_period import PAnd - return PAnd(self, other) + return PAnd(self, other) def __rand__(self, other): - from .ops_period import PAnd + if isinstance(other, Expression): + from .ops import And - return PAnd(other, self) + return And(other, self) + else: + from .ops_period import PAnd + + return PAnd(other, self) def __or__(self, other): - from .ops_period import POr + if isinstance(other, Expression): + from .ops import Or + + return Or(self, other) + else: + from .ops_period import POr - return POr(self, other) + return POr(self, other) def __ror__(self, other): - from .ops_period import POr + if isinstance(other, Expression): + from .ops import Or - return POr(other, self) + return Or(other, self) + else: + from .ops_period import POr + + return POr(other, self) @abc.abstractmethod def load_period_data(self, instrument, start_offset, end_offset, cur_index): @@ -358,11 +466,21 @@ def load(self, instrument, start_index, end_index, freq): if start_index is None or end_index is None or start_index > end_index: raise ValueError("Invalid index range: {} 
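This dispatch keeps mixed expressions well typed: combining a PIT feature with a daily Expression yields a daily operator node, while combining two PIT features stays in the period domain. A stripped-down sketch of the rule (the classes below are stand-ins for illustration, not the qlib ones):

class Daily:  # stand-in for a daily-frequency Expression
    pass

class DailyMul:  # stand-in for ops.Mul
    def __init__(self, left, right):
        self.left, self.right = left, right

class PeriodMul:  # stand-in for ops_period.PMul
    def __init__(self, left, right):
        self.left, self.right = left, right

class Period:  # stand-in for PExpression
    def __mul__(self, other):
        # same rule as PExpression.__mul__ above: a daily operand promotes
        # the node to the daily operator, otherwise stay in the period domain
        if isinstance(other, Daily):
            return DailyMul(self, other)
        return PeriodMul(self, other)

print(type(Period() * Daily()).__name__)   # DailyMul
print(type(Period() * Period()).__name__)  # PeriodMul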
{}".format(start_index, end_index)) - resample_series = pd.Series(index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)) + from .data import Cal + + _calendar = Cal.calendar(freq=freq) + resample_data = np.empty(end_index - start_index + 1, dtype="float32") + for cur_index in range(start_index, end_index + 1): + cur_date = _calendar[cur_index] start_offset = self.get_period_offset(cur_index) - resample_series[cur_index] = self.load_period_data(instrument, start_offset, 0, cur_index).iloc[-1] + resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[ + -1 + ] + resample_series = pd.Series( + resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) + ) H["f"][args] = resample_series return resample_series @@ -385,13 +503,13 @@ def __str__(self): def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code - # from .data import FeatureD + from .data import FeatureD - # return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index) - return pd.Series([1, 2, 3]) # fot test + return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index) + # return pd.Series([1, 2, 3]) # fot test def get_period_offset(self, cur_index): - return 0, 0 + return 0 class PExpressionOps(PExpression): @@ -431,3 +549,20 @@ def __getattr__(self, key): Operators = OpsWrapper() + + +def register_all_ops(C): + """register all operator""" + logger = get_module_logger("base") + + Operators.reset() + + from .ops import OpsList + from .ops_period import PeriodOpsList + + Operators.register(OpsList) + Operators.register(PeriodOpsList) + + if getattr(C, "custom_ops", None) is not None: + Operators.register(C.custom_ops) + logger.debug("register custom period operator {}".format(C.custom_ops)) diff --git a/qlib/data/data.py b/qlib/data/data.py index ba66b75253..0b9d33f681 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -20,7 +20,7 @@ from .cache import H from ..config import C from ..log import get_module_logger -from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname +from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname, read_period_interval_data from .base import Feature, PFeature, Operators from .cache import DiskDatasetCache, DiskExpressionCache from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path @@ -653,7 +653,7 @@ def feature(self, instrument, field, start_index, end_index, freq): series = read_bin(uri_data, start_index, end_index) return series - def period_feature(self, instrument, field, start_offset, end_offset, cur_index): + def period_feature(self, instrument, field, start_offset, end_offset, cur_date): DATA_RECORDS = [("date", "I"), ("period", "I"), ("value", "d"), ("_next", "I")] NA_VALUE = float("NAN") @@ -663,18 +663,17 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_index) if not field.startswith("q_") and not field.startswith("a_"): raise ValueError("period field must start with 'q_' or 'a_'") quarterly = field.startswith("q_") - index_path = sself._uri_period_index.format(instrument.lower(), field) + index_path = self._uri_period_index.format(instrument.lower(), field) data_path = self._uri_period_data.format(instrument.lower(), field) - - data = np.fromfile(data_file, dtype=DATA_RECORDS) + data = np.fromfile(data_path, dtype=DATA_RECORDS) # find all 
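PExpression.load above turns an irregular stream of announcements into a daily series: for every slot of the trading calendar it loads the data visible at that date and keeps the latest value (the trailing .iloc[-1]). The effect can be reproduced with plain pandas on synthetic announcement dates, which are not qlib data:

import pandas as pd

calendar = pd.date_range("2020-06-01", "2020-06-10", freq="B")  # stand-in trading calendar
# announcement_date -> reported value (synthetic)
announcements = pd.Series(
    [0.21, 0.24],
    index=pd.to_datetime(["2020-05-15", "2020-06-04"]),
)

def latest_visible(cur_date):
    # only use values published strictly before cur_date,
    # matching the `date >= cur_date: break` rule in the record reader
    visible = announcements[announcements.index < cur_date]
    return visible.iloc[-1] if len(visible) else float("nan")

daily = pd.Series([latest_visible(d) for d in calendar], index=calendar, dtype="float32")
print(daily)  # 0.21 up to 2020-06-04, 0.24 from 2020-06-05 onward

This is also why freq is part of the cache key: the same PIT expression resampled on a different calendar yields a different daily series.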
revision periods before `cur_date` + cur_date = int(cur_date.year) * 10000 + int(cur_date.month) * 100 + int(cur_date.day) loc = np.searchsorted(data["date"], cur_date, side="left") if loc <= 0: return NA_VALUE - last_period = data["period"][loc - start_offset : loc - end_offset].max() # return the latest quarter - first_period = data["period"][loc - start_offset : loc - end_offset].min() - - series = read_period_interval_data(index_path, data_path, last_period, first_period, cur_index, quarterly) + last_period = data["period"][loc - start_offset - 1 : loc - end_offset].max() # return the latest quarter + first_period = data["period"][loc - start_offset - 1 : loc - end_offset].min() + series = read_period_interval_data(index_path, data_path, last_period, first_period, cur_date, quarterly) return series diff --git a/qlib/data/ops.py b/qlib/data/ops.py index d32bfc692f..671f2c6c74 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -292,7 +292,6 @@ def __init__(self, feature_left, feature_right, func): super(NpPairOperator, self).__init__(feature_left, feature_right) def _load_internal(self, instrument, start_index, end_index, freq): - print((self.feature_left, self.feature_right)) assert any( [isinstance(self.feature_left, (Expression, PExpression)), self.feature_right, Expression] ), "at least one of two inputs is Expression instance" diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py index c94a24482c..59c2402435 100644 --- a/qlib/data/ops_period.py +++ b/qlib/data/ops_period.py @@ -607,7 +607,7 @@ def __init__(self, feature_left, feature_right, N): super(PCov, self).__init__(feature_left, feature_right, N, "cov") -OpsList = [ +PeriodOpsList = [ PRef, PMax, PMin, @@ -654,17 +654,3 @@ def __init__(self, feature_left, feature_right, N): PIdxMin, PIf, ] - - -def register_all_period_ops(C): - """register all operator""" - logger = get_module_logger("ops") - - from .base import Operators - - # Operators.reset() - Operators.register(OpsList) - - if getattr(C, "custom_period_ops", None) is not None: - Operators.register(C.custom_ops) - logger.debug("register custom period operator {}".format(C.custom_ops)) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index bebfd1d703..b93ed9d33a 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -56,7 +56,7 @@ def read_bin(file_path, start_index, end_index): return series -def read_period_interval_data(index_path, data_path, last_period, first_period, cur_index, quarterly): +def read_period_interval_data(index_path, data_path, last_period, first_period, cur_date, quarterly): INDEX_DTYPE = "I" # unsigned int32 DATE_DTYPE = "I" @@ -122,6 +122,7 @@ def get_period_offset(first_year, period, quarterly): for i, period in enumerate(period_list): value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) + return series diff --git a/tests/test_PIT.py b/tests/test_PIT.py new file mode 100644 index 0000000000..8edcf01a88 --- /dev/null +++ b/tests/test_PIT.py @@ -0,0 +1,8 @@ +import qlib + +qlib.init(provider_uri="~/.qlib/qlib_data/us_data") +from qlib.data import D + +instruments = ["a1x4w7"] +fields = ["PSum($$q_taxrate, 4)/$close"] +print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From 9f1cc6404368f29ce61efad1997e7f7bd034f785 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 11 Mar 2021 01:43:06 +0900 Subject: [PATCH 06/30] update PIT test example --- tests/test_PIT.py | 8 +++----- 1 file 
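The revision lookup in period_feature relies on the `date` column being stored as sorted YYYYMMDD integers, so np.searchsorted(..., side="left") returns the number of revisions published strictly before the query date. A small check of that behaviour (the dates are made up):

import numpy as np
import pandas as pd

dates = np.array([20200430, 20200604, 20200604, 20200820], dtype="uint32")  # sorted publish dates

def to_int(ts):
    # same encoding as period_feature: year * 10000 + month * 100 + day
    return ts.year * 10000 + ts.month * 100 + ts.day

for day in ["2020-06-01", "2020-06-04", "2020-06-05"]:
    cur = to_int(pd.Timestamp(day))
    loc = np.searchsorted(dates, cur, side="left")
    print(day, "->", loc, "revisions published before this date")
# 2020-06-01 -> 1, 2020-06-04 -> 1, 2020-06-05 -> 3

With side="left", a revision published exactly on the query date is excluded, consistent with the `date >= cur_date` break in the record reader.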
changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_PIT.py b/tests/test_PIT.py index 8edcf01a88..e0ed5da377 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -1,8 +1,6 @@ import qlib - qlib.init(provider_uri="~/.qlib/qlib_data/us_data") from qlib.data import D - -instruments = ["a1x4w7"] -fields = ["PSum($$q_taxrate, 4)/$close"] -print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) +instruments = ['a1x4w7'] +fields = ['PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close', "($close*$$q_taxrate)-($high*$$q_taxrate)"] +print(D.features(instruments, fields, start_time='2020-06-01', end_time='2020-06-10', freq='day')) \ No newline at end of file From 63e4895603778238f501b1994d837bcc97b1f988 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 11 Mar 2021 01:43:24 +0900 Subject: [PATCH 07/30] black format --- tests/test_PIT.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_PIT.py b/tests/test_PIT.py index e0ed5da377..02644056be 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -1,6 +1,8 @@ import qlib + qlib.init(provider_uri="~/.qlib/qlib_data/us_data") from qlib.data import D -instruments = ['a1x4w7'] -fields = ['PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close', "($close*$$q_taxrate)-($high*$$q_taxrate)"] -print(D.features(instruments, fields, start_time='2020-06-01', end_time='2020-06-10', freq='day')) \ No newline at end of file + +instruments = ["a1x4w7"] +fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "($close*$$q_taxrate)-($high*$$q_taxrate)"] +print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From 88b7926cf733ac45c972e00a8c929d5773f8a89c Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 11 Mar 2021 01:45:54 +0900 Subject: [PATCH 08/30] update PIT test --- tests/test_PIT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_PIT.py b/tests/test_PIT.py index 02644056be..50fdf6a21c 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -4,5 +4,5 @@ from qlib.data import D instruments = ["a1x4w7"] -fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "($close*$$q_taxrate)-($high*$$q_taxrate)"] +fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From a2dae5c2888bfd2314ea6225cd26ef99e5a6bfab Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 12 Mar 2021 15:56:39 +0900 Subject: [PATCH 09/30] update tets_PIT --- tests/test_PIT.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/test_PIT.py b/tests/test_PIT.py index 50fdf6a21c..e3c8e5dd72 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -1,8 +1,26 @@ -import qlib +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
-qlib.init(provider_uri="~/.qlib/qlib_data/us_data") +import sys +import unittest +import qlib from qlib.data import D +from qlib.tests import TestAutoData +from qlib.config import REG_US + + +class TestRegiterCustomOps(TestAutoData): + @classmethod + def setUpClass(cls) -> None: + # use default data + provider_uri = "~/.qlib/qlib_data/us_data" # target_dir + qlib.init(provider_uri=provider_uri, region=REG_US) + + def test_regiter_custom_ops(self): + instruments = ["a1x4w7"] + fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] + print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) + -instruments = ["a1x4w7"] -fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] -print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) +if __name__ == "__main__": + unittest.main() From 99db80d248d74f7a2ed118a52eed419d1c87ad93 Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 12 Mar 2021 21:14:28 +0900 Subject: [PATCH 10/30] update code format --- qlib/config.py | 12 +++++ qlib/data/data.py | 33 ++++++++++++-- qlib/utils/__init__.py | 101 ++++++++++++++--------------------------- 3 files changed, 74 insertions(+), 72 deletions(-) diff --git a/qlib/config.py b/qlib/config.py index 1ab4bec91d..cbfde0850d 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -140,6 +140,18 @@ def set_conf_from_C(self, config_c): "default_exp_name": "Experiment", }, }, + "pit_record_type": { + "date": "I", # uint32 + "period": "I", # uint32 + "value": "d", # float64 + "index": "I", # uint32 + }, + "pit_record_nan": { + "date": 0, + "period": 0, + "value": float("NAN"), + "index": 0xFFFFFFFF, + }, } MODE_CONF = { diff --git a/qlib/data/data.py b/qlib/data/data.py index 0b9d33f681..35ede9aba3 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -20,7 +20,7 @@ from .cache import H from ..config import C from ..log import get_module_logger -from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname, read_period_interval_data +from ..utils import parse_field, read_bin, read_period_data, hash_args, normalize_cache_fields, code_to_fname from .base import Feature, PFeature, Operators from .cache import DiskDatasetCache, DiskExpressionCache from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path @@ -654,9 +654,14 @@ def feature(self, instrument, field, start_index, end_index, freq): return series def period_feature(self, instrument, field, start_offset, end_offset, cur_date): - DATA_RECORDS = [("date", "I"), ("period", "I"), ("value", "d"), ("_next", "I")] - NA_VALUE = float("NAN") + DATA_RECORDS = [ + ("date", C.pit_record_type["date"]), + ("period", C.pit_record_type["period"]), + ("value", C.pit_record_type["value"]), + ("_next", C.pit_record_type["index"]), + ] + VALUE_TYPE = C.pit_record_type["value"] field = str(field).lower()[2:] instrument = code_to_fname(instrument) @@ -666,14 +671,32 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date): index_path = self._uri_period_index.format(instrument.lower(), field) data_path = self._uri_period_data.format(instrument.lower(), field) data = np.fromfile(data_path, dtype=DATA_RECORDS) + # find all revision periods before `cur_date` cur_date = int(cur_date.year) * 10000 + int(cur_date.month) * 100 + int(cur_date.day) loc = np.searchsorted(data["date"], cur_date, side="left") if loc <= 0: - return 
NA_VALUE + return C.pit_record_nan["value"] last_period = data["period"][loc - start_offset - 1 : loc - end_offset].max() # return the latest quarter first_period = data["period"][loc - start_offset - 1 : loc - end_offset].min() - series = read_period_interval_data(index_path, data_path, last_period, first_period, cur_date, quarterly) + + if not quarterly: + assert all(1900 <= x <= 2099 for x in (first_period, last_period)), "invalid arguments" + period_list = list(range(first_period, last_period + 1)) + else: + assert all(190000 <= x <= 209904 for x in (first_period, last_period)), "invalid arguments" + period_list = [] + for year in range(first_period // 100, last_period // 100 + 1): + for q in range(1, 5): + period = year * 100 + q + if first_period <= period <= last_period: + period_list.append(year * 100 + q) + + value = np.empty(len(period_list), dtype=VALUE_TYPE) + for i, period in enumerate(period_list): + value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) + + series = pd.Series(value, index=period_list, dtype=VALUE_TYPE) return series diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index b93ed9d33a..ff6d14d014 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -56,74 +56,41 @@ def read_bin(file_path, start_index, end_index): return series -def read_period_interval_data(index_path, data_path, last_period, first_period, cur_date, quarterly): - - INDEX_DTYPE = "I" # unsigned int32 - DATE_DTYPE = "I" - PERIOD_DTYPE = "I" - VALUE_DTYPE = "d" # float64 - - INDEX_DTYPE_SIZE = struct.calcsize(INDEX_DTYPE) - DATE_DTYPE_SIZE = struct.calcsize(DATE_DTYPE) - PERIOD_DTYPE_SIZE = struct.calcsize(PERIOD_DTYPE) - VALUE_DTYPE_SIZE = struct.calcsize(VALUE_DTYPE) - - DATA_RECORDS = [("date", DATE_DTYPE), ("period", PERIOD_DTYPE), ("value", VALUE_DTYPE), ("_next", INDEX_DTYPE)] - - DATA_DTYPE = "".join([v for k, v in DATA_RECORDS]) - DATA_DTYPE_SIZE = struct.calcsize(DATA_DTYPE) - - NA_DATE = 0 - NA_PERIOD = 0 - NA_INDEX = 0xFFFFFFFF - NA_VALUE = float("NAN") - - def get_period_list(first, last, quarterly): - if not quarterly: - assert all(1900 <= x <= 2099 for x in (first, last)), "invalid arguments" - return list(range(first, last + 1)) - assert all(190000 <= x <= 209904 for x in (first, last)), "invalid arguments" - res = [] - for year in range(first // 100, last // 100 + 1): - for q in range(1, 5): - period = year * 100 + q - if first <= period <= last: - res.append(year * 100 + q) - return res - - period_list = get_period_list(first_period, last_period, quarterly) - value = np.empty(len(period_list), dtype=VALUE_DTYPE) - - def read_period_data(index_path, data_path, period, cur_date, quarterly): - def get_period_offset(first_year, period, quarterly): - if quarterly: - offset = (period // 100 - first_year) * 4 + period % 100 - 1 - else: - offset = period - first_year - return offset - - with open(index_path, "rb") as fi: - (first_year,) = struct.unpack(PERIOD_DTYPE, fi.read(PERIOD_DTYPE_SIZE)) - all_periods = np.fromfile(fi, dtype=INDEX_DTYPE) - # find the first index of linked revisions - offset = get_period_offset(first_year, period, quarterly) - _next = all_periods[offset] - # load data following the `_next` link - prev_value = NA_VALUE - with open(data_path, "rb") as fd: - while _next != NA_INDEX: - fd.seek(_next) - date, period, value, _next = struct.unpack(DATA_DTYPE, fd.read(DATA_DTYPE_SIZE)) - if date >= cur_date: # NOTE: only use after published date - break - prev_value = value - return prev_value - - for i, period in 
enumerate(period_list): - value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) - series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) +def read_period_data(index_path, data_path, period, cur_date, quarterly): + + DATA_DTYPE = "".join( + [ + C.pit_record_type["date"], + C.pit_record_type["period"], + C.pit_record_type["value"], + C.pit_record_type["index"], + ] + ) - return series + PERIOD_TYPE = C.pit_record_type["period"] + INDEX_TYPE = C.pit_record_type["index"] + + NAN_VALUE = C.pit_record_nan["value"] + NAN_INDEX = C.pit_record_nan["index"] + + with open(index_path, "rb") as fi: + (first_year,) = struct.unpack(PERIOD_TYPE, fi.read(struct.calcsize(PERIOD_TYPE))) + all_periods = np.fromfile(fi, dtype=INDEX_TYPE) + + # find the first index of linked revisions + offset = (period // 100 - first_year) * 4 + period % 100 - 1 if quarterly else period - first_year + _next = all_periods[offset] + + # load data following the `_next` link + prev_value = NAN_VALUE + with open(data_path, "rb") as fd: + while _next != NAN_INDEX: + fd.seek(_next) + date, period, value, _next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) + if date >= cur_date: # NOTE: only use after published date + break + prev_value = value + return prev_value def np_ffill(arr: np.array): From c4bbe6b92fa1d779ad59a79d052cc03502a5e732 Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 12 Mar 2021 22:08:18 +0900 Subject: [PATCH 11/30] add check_feature_exist --- qlib/data/base.py | 26 +++++++++++++++++++++++--- qlib/data/data.py | 8 ++++---- tests/test_PIT.py | 1 + 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index 19eff1b895..b0b2c0db7e 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -5,10 +5,12 @@ from __future__ import division from __future__ import print_function +import os import abc import pandas as pd import numpy as np +from ..utils import code_to_fname from ..log import get_module_logger @@ -456,7 +458,17 @@ def load_period_data(self, instrument, start_offset, end_offset, cur_index): def get_period_offset(self, cur_index): raise NotImplementedError("This function must be implemented in your newly defined feature") + def check_feature_exist(self, instrument): + child_exist_list = [v.check_feature_exist(instrument) for k, v in self.__dict__.items() if isinstance(v, PExpression)] + return all(child_exist_list) + + def load(self, instrument, start_index, end_index, freq): + + if not self.check_feature_exist(instrument): + get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + return pd.Series(dtype="float32", name=str(self)) + from .cache import H # cache @@ -474,9 +486,7 @@ def load(self, instrument, start_index, end_index, freq): for cur_index in range(start_index, end_index + 1): cur_date = _calendar[cur_index] start_offset = self.get_period_offset(cur_index) - resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[ - -1 - ] + resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[-1] resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) @@ -501,6 +511,16 @@ def __init__(self, name=None): def __str__(self): return "$$" + self._name + def check_feature_exist(self, instrument): + from .data import FeatureD + + instrument = code_to_fname(instrument).lower() + index_path = 
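The reader above can be exercised end to end with a tiny hand-built index/data pair. The file names and figures below are synthetic, but the struct layout matches the record definition (date:I, period:I, value:d, _next:I) and the 0xFFFFFFFF end-of-chain sentinel:

import os
import struct
import tempfile
import numpy as np

DATA_DTYPE = "IIdI"                # date, period, value, _next
REC = struct.calcsize(DATA_DTYPE)  # native padding included; used consistently below
NA_INDEX = 0xFFFFFFFF

tmp = tempfile.mkdtemp()
index_path = os.path.join(tmp, "q_taxrate.index")
data_path = os.path.join(tmp, "q_taxrate.data")

# two revisions of period 2019Q1: first published 2019-04-30, restated 2019-08-20
records = [
    (20190430, 201901, 0.20, 1 * REC),   # _next = byte offset of the restatement
    (20190820, 201901, 0.22, NA_INDEX),  # end of the revision chain
]
with open(data_path, "wb") as fd:
    for rec in records:
        fd.write(struct.pack(DATA_DTYPE, *rec))

# index file: first stored year, then one uint32 head offset per period slot
first_year = 2019
heads = np.full(4, NA_INDEX, dtype="uint32")  # 4 quarterly slots for 2019
heads[0] = 0                                  # 2019Q1 chain starts at byte 0 of the data file
with open(index_path, "wb") as fi:
    fi.write(struct.pack("I", first_year))
    heads.tofile(fi)

def read_period_value(period, cur_date, quarterly=True):
    # same walk as read_period_data: start at the head offset, follow _next,
    # and keep the last value published strictly before cur_date
    with open(index_path, "rb") as fi:
        (fy,) = struct.unpack("I", fi.read(struct.calcsize("I")))
        all_periods = np.fromfile(fi, dtype="uint32")
    offset = (period // 100 - fy) * 4 + period % 100 - 1 if quarterly else period - fy
    _next = all_periods[offset]
    prev_value = float("nan")
    with open(data_path, "rb") as fd:
        while _next != NA_INDEX:
            fd.seek(_next)
            date, _, value, _next = struct.unpack(DATA_DTYPE, fd.read(REC))
            if date >= cur_date:
                break
            prev_value = value
    return prev_value

print(read_period_value(201901, 20190601))  # 0.20 (restatement not yet published)
print(read_period_value(201901, 20191001))  # 0.22 (restatement visible)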
FeatureD.uri_period_index.format(instrument, self._name) + data_path = FeatureD.uri_period_data.format(instrument, self._name) + + return os.path.exists(index_path) and os.path.exists(data_path) + + def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code from .data import FeatureD diff --git a/qlib/data/data.py b/qlib/data/data.py index 35ede9aba3..7ef2e40f36 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -633,11 +633,11 @@ def _uri_data(self): return os.path.join(C.get_data_path(), "features", "{}", "{}.{}.bin") @property - def _uri_period_index(self): + def uri_period_index(self): return os.path.join(C.get_data_path(), "financial", "{}", "{}.index") @property - def _uri_period_data(self): + def uri_period_data(self): return os.path.join(C.get_data_path(), "financial", "{}", "{}.data") def feature(self, instrument, field, start_index, end_index, freq): @@ -668,8 +668,8 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date): if not field.startswith("q_") and not field.startswith("a_"): raise ValueError("period field must start with 'q_' or 'a_'") quarterly = field.startswith("q_") - index_path = self._uri_period_index.format(instrument.lower(), field) - data_path = self._uri_period_data.format(instrument.lower(), field) + index_path = self.uri_period_index.format(instrument.lower(), field) + data_path = self.uri_period_data.format(instrument.lower(), field) data = np.fromfile(data_path, dtype=DATA_RECORDS) # find all revision periods before `cur_date` diff --git a/tests/test_PIT.py b/tests/test_PIT.py index e3c8e5dd72..c831d35a6a 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -17,6 +17,7 @@ def setUpClass(cls) -> None: qlib.init(provider_uri=provider_uri, region=REG_US) def test_regiter_custom_ops(self): + #instruments = D.instruments(market='all') instruments = ["a1x4w7"] fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From 20bcf25bc1661d327b102de3acc3c2e2b7ed0cda Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 12 Mar 2021 22:08:34 +0900 Subject: [PATCH 12/30] black format --- qlib/data/base.py | 10 ++++++---- qlib/data/data.py | 2 +- tests/test_PIT.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index b0b2c0db7e..a1596572ff 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -459,10 +459,11 @@ def get_period_offset(self, cur_index): raise NotImplementedError("This function must be implemented in your newly defined feature") def check_feature_exist(self, instrument): - child_exist_list = [v.check_feature_exist(instrument) for k, v in self.__dict__.items() if isinstance(v, PExpression)] + child_exist_list = [ + v.check_feature_exist(instrument) for k, v in self.__dict__.items() if isinstance(v, PExpression) + ] return all(child_exist_list) - def load(self, instrument, start_index, end_index, freq): if not self.check_feature_exist(instrument): @@ -486,7 +487,9 @@ def load(self, instrument, start_index, end_index, freq): for cur_index in range(start_index, end_index + 1): cur_date = _calendar[cur_index] start_offset = self.get_period_offset(cur_index) - resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[-1] + resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[ + -1 + ] 
resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) @@ -520,7 +523,6 @@ def check_feature_exist(self, instrument): return os.path.exists(index_path) and os.path.exists(data_path) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): ### Zhou Code from .data import FeatureD diff --git a/qlib/data/data.py b/qlib/data/data.py index 7ef2e40f36..7e11d54b69 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -695,7 +695,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date): value = np.empty(len(period_list), dtype=VALUE_TYPE) for i, period in enumerate(period_list): value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) - + series = pd.Series(value, index=period_list, dtype=VALUE_TYPE) return series diff --git a/tests/test_PIT.py b/tests/test_PIT.py index c831d35a6a..c3109667e1 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -17,7 +17,7 @@ def setUpClass(cls) -> None: qlib.init(provider_uri=provider_uri, region=REG_US) def test_regiter_custom_ops(self): - #instruments = D.instruments(market='all') + # instruments = D.instruments(market='all') instruments = ["a1x4w7"] fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From 6e23ff7b2acd732c2bc81f4cb971fdbb07c216eb Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 12 Mar 2021 23:06:34 +0900 Subject: [PATCH 13/30] optimize the PIT Algorithm --- qlib/data/base.py | 12 +++--- qlib/data/data.py | 26 +++++++++-- qlib/data/ops_period.py | 96 +++++++++++++++++++++-------------------- qlib/utils/__init__.py | 24 +++++++---- 4 files changed, 94 insertions(+), 64 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index a1596572ff..51b8f54384 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -451,7 +451,7 @@ def __ror__(self, other): return POr(other, self) @abc.abstractmethod - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): raise NotImplementedError("This function must be implemented in your newly defined feature") @abc.abstractmethod @@ -487,9 +487,9 @@ def load(self, instrument, start_index, end_index, freq): for cur_index in range(start_index, end_index + 1): cur_date = _calendar[cur_index] start_offset = self.get_period_offset(cur_index) - resample_data[cur_index - start_index] = self.load_period_data(instrument, start_offset, 0, cur_date).iloc[ - -1 - ] + resample_data[cur_index - start_index] = self.load_period_data( + instrument, start_offset, 0, cur_date, info=(start_index, end_index, cur_index) + ).iloc[-1] resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) @@ -523,11 +523,11 @@ def check_feature_exist(self, instrument): return os.path.exists(index_path) and os.path.exists(data_path) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): ### Zhou Code from .data import FeatureD - return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index) + return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) # return pd.Series([1, 2, 3]) # fot test def 
get_period_offset(self, cur_index): diff --git a/qlib/data/data.py b/qlib/data/data.py index 7e11d54b69..026ffca192 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -653,7 +653,7 @@ def feature(self, instrument, field, start_index, end_index, freq): series = read_bin(uri_data, start_index, end_index) return series - def period_feature(self, instrument, field, start_offset, end_offset, cur_date): + def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): DATA_RECORDS = [ ("date", C.pit_record_type["date"]), @@ -665,6 +665,17 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date): field = str(field).lower()[2:] instrument = code_to_fname(instrument) + + start_index, end_index, cur_index = kwargs["info"] + if cur_index == start_index: + if not hasattr(self, "all_field"): + self.all_field = [] + self.all_field.append(field) + if not hasattr(self, "period_index") is None: + self.period_index = {} + if field not in self.period_index: + self.period_index[field] = {} + if not field.startswith("q_") and not field.startswith("a_"): raise ValueError("period field must start with 'q_' or 'a_'") quarterly = field.startswith("q_") @@ -694,9 +705,18 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date): value = np.empty(len(period_list), dtype=VALUE_TYPE) for i, period in enumerate(period_list): - value[i] = read_period_data(index_path, data_path, period, cur_date, quarterly) - + last_period_index = self.period_index.get(period) + value[i], now_period_index = read_period_data( + index_path, data_path, period, cur_date, quarterly, last_period_index + ) + self.period_index[period] = now_period_index series = pd.Series(value, index=period_list, dtype=VALUE_TYPE) + + if cur_index == end_index: + self.all_field.remove(field) + if len(self.all_field) == 0: + del self.period_index + return series diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py index 59c2402435..b68a742f89 100644 --- a/qlib/data/ops_period.py +++ b/qlib/data/ops_period.py @@ -46,8 +46,8 @@ def __init__(self, feature, func): self.func = func super(PNpElemOperator, self).__init__(feature) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) return getattr(np, self.func)(series) @@ -60,11 +60,11 @@ class PSign(PNpElemOperator): def __init__(self, feature): super(PSign, self).__init__(feature, "sign") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): """ To avoid error raised by bool type input, we transform the data into float32. 
""" - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # TODO: More precision types should be configurable series = series.astype(np.float32) return getattr(np, self.func)(series) @@ -83,8 +83,8 @@ def __init__(self, feature, exponent): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.exponent) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) return getattr(np, self.func)(series, self.exponent) @@ -96,7 +96,7 @@ def __init__(self, feature, instrument): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower()) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): return self.feature.load_period_data(self.instrument, start_offset, end_offset, cur_index) @@ -135,16 +135,18 @@ def __init__(self, feature_left, feature_right, func): self.func = func super(PNpPairOperator, self).__init__(feature_left, feature_right) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): assert any( [isinstance(self.feature_left, PExpression), self.feature_right, PExpression] ), "at least one of two inputs is PExpression instance" if isinstance(self.feature_left, PExpression): - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) else: series_left = self.feature_left # numeric value if isinstance(self.feature_right, PExpression): - series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + series_right = self.feature_right.load_period_data( + instrument, start_offset, end_offset, cur_index, **kwargs + ) else: series_right = self.feature_right return getattr(np, self.func)(series_left, series_right) @@ -230,14 +232,16 @@ def __init__(self, condition, feature_left, feature_right): def __str__(self): return "PIf({},{},{})".format(self.condition, self.feature_left, self.feature_right) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series_cond = self.condition.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series_cond = self.condition.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if isinstance(self.feature_left, PExpression): - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) else: series_left = self.feature_left if isinstance(self.feature_right, PExpression): - series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + series_right = self.feature_right.load_period_data( + instrument, start_offset, end_offset, 
cur_index, **kwargs + ) else: series_right = self.feature_right series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index) @@ -275,8 +279,8 @@ def __init__(self, feature, N, func): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.N) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # NOTE: remove all null check, # now it's user's responsibility to decide whether use features in null days # isnull = series.isnull() # NOTE: isnull = NaN, inf is not null @@ -302,8 +306,8 @@ class PRef(PRolling): def __init__(self, feature, N): super(PRef, self).__init__(feature, N, "ref") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # N = 0, return first day if series.empty: return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049 @@ -362,8 +366,8 @@ class PIdxMax(PRolling): def __init__(self, feature, N): super(PIdxMax, self).__init__(feature, N, "idxmax") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) else: @@ -380,8 +384,8 @@ class PIdxMin(PRolling): def __init__(self, feature, N): super(PIdxMin, self).__init__(feature, N, "idxmin") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) else: @@ -397,8 +401,8 @@ def __init__(self, feature, N, qscore): def __str__(self): return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = series.expanding(min_periods=1).quantile(self.qscore) else: @@ -415,8 +419,8 @@ class PMad(PRolling): def __init__(self, feature, N): super(PMad, self).__init__(feature, N, "mad") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, 
instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # TODO: implement in Cython def mad(x): @@ -434,8 +438,8 @@ class PRank(PRolling): def __init__(self, feature, N): super(PRank, self).__init__(feature, N, "rank") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # TODO: implement in Cython def rank(x): @@ -462,8 +466,8 @@ class PDelta(PRolling): def __init__(self, feature, N): super(PDelta, self).__init__(feature, N, "delta") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = series - series.iloc[0] else: @@ -477,8 +481,8 @@ class PSlope(PRolling): def __init__(self, feature, N): super(PSlope, self).__init__(feature, N, "slope") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = pd.Series(expanding_slope(series.values), index=series.index) else: @@ -490,8 +494,8 @@ class PRsquare(PRolling): def __init__(self, feature, N): super(PRsquare, self).__init__(feature, N, "rsquare") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - _series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + _series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = pd.Series(expanding_rsquare(_series.values), index=_series.index) else: @@ -504,8 +508,8 @@ class PResi(PRolling): def __init__(self, feature, N): super(PResi, self).__init__(feature, N, "resi") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = pd.Series(expanding_resi(series.values), index=series.index) else: @@ -517,8 +521,8 @@ class PWMA(PRolling): def __init__(self, feature, N): super(PWMA, self).__init__(feature, N, "wma") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) # TODO: implement in Cython def weighted_mean(x): @@ -537,8 +541,8 @@ class 
PEMA(PRolling): def __init__(self, feature, N): super(PEMA, self).__init__(feature, N, "ema") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) def exp_weighted_mean(x): a = 1 - 2 / (1 + len(x)) @@ -566,9 +570,9 @@ def __init__(self, feature_left, feature_right, N, func): def __str__(self): return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N) - def load_period_data(self, instrument, start_offset, end_offset, cur_index): - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) - series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) if self.N == 0: series = getattr(series_left.expanding(min_periods=1), self.func)(series_right) else: @@ -589,12 +593,12 @@ class PCorr(PairRolling): def __init__(self, feature_left, feature_right, N): super(PCorr, self).__init__(feature_left, feature_right, N, "corr") - def load_period_data(self, instrument, start_offset, end_offset, cur_index): + def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): res = super(PCorr, self)._load_internal(instrument, start_index, end_index, freq) # NOTE: Load uses MemCache, so calling load_period_data again will not cause performance degradation - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index) - series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index) + series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) + series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) res.loc[ np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index ff6d14d014..fbd01c716e 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -56,7 +56,7 @@ def read_bin(file_path, start_index, end_index): return series -def read_period_data(index_path, data_path, period, cur_date, quarterly): +def read_period_data(index_path, data_path, period, cur_date, quarterly, last_period_index): DATA_DTYPE = "".join( [ @@ -73,24 +73,30 @@ def read_period_data(index_path, data_path, period, cur_date, quarterly): NAN_VALUE = C.pit_record_nan["value"] NAN_INDEX = C.pit_record_nan["index"] - with open(index_path, "rb") as fi: - (first_year,) = struct.unpack(PERIOD_TYPE, fi.read(struct.calcsize(PERIOD_TYPE))) - all_periods = np.fromfile(fi, dtype=INDEX_TYPE) - # find the first index of linked revisions - offset = (period // 100 - first_year) * 4 + period % 100 - 1 if quarterly else period - first_year - _next = all_periods[offset] + if last_period_index is None: + with open(index_path, "rb") as fi: + (first_year,) = 
struct.unpack(PERIOD_TYPE, fi.read(struct.calcsize(PERIOD_TYPE))) + all_periods = np.fromfile(fi, dtype=INDEX_TYPE) + offset = (period // 100 - first_year) * 4 + period % 100 - 1 if quarterly else period - first_year + _next = all_periods[offset] + else: + _next = last_period_index # load data following the `_next` link prev_value = NAN_VALUE + prev_next = _next + with open(data_path, "rb") as fd: while _next != NAN_INDEX: fd.seek(_next) - date, period, value, _next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) + date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) if date >= cur_date: # NOTE: only use after published date break + prev_next = _next + _next = new_next prev_value = value - return prev_value + return prev_value, prev_next def np_ffill(arr: np.array): From 88a0d3dfdd082fecbd5f12547239657ac17188ef Mon Sep 17 00:00:00 2001 From: bxdd Date: Sat, 13 Mar 2021 01:24:15 +0900 Subject: [PATCH 14/30] fix bug --- qlib/data/data.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 026ffca192..264509e03f 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -668,10 +668,10 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, start_index, end_index, cur_index = kwargs["info"] if cur_index == start_index: - if not hasattr(self, "all_field"): - self.all_field = [] - self.all_field.append(field) - if not hasattr(self, "period_index") is None: + if not hasattr(self, "all_fields"): + self.all_fields = [] + self.all_fields.append(field) + if not hasattr(self, "period_index"): self.period_index = {} if field not in self.period_index: self.period_index[field] = {} @@ -705,16 +705,17 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, value = np.empty(len(period_list), dtype=VALUE_TYPE) for i, period in enumerate(period_list): - last_period_index = self.period_index.get(period) + last_period_index = self.period_index[field].get(period) value[i], now_period_index = read_period_data( index_path, data_path, period, cur_date, quarterly, last_period_index ) - self.period_index[period] = now_period_index + self.period_index[field].update({period: now_period_index}) series = pd.Series(value, index=period_list, dtype=VALUE_TYPE) if cur_index == end_index: - self.all_field.remove(field) - if len(self.all_field) == 0: + self.all_fields.remove(field) + if not len(self.all_fields): + del self.all_fields del self.period_index return series From f52462a839e1ce3b8772b5e59467974475284ead Mon Sep 17 00:00:00 2001 From: bxdd Date: Sat, 13 Mar 2021 01:39:31 +0900 Subject: [PATCH 15/30] update example --- tests/test_PIT.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_PIT.py b/tests/test_PIT.py index c3109667e1..a6fbcc353f 100644 --- a/tests/test_PIT.py +++ b/tests/test_PIT.py @@ -17,9 +17,17 @@ def setUpClass(cls) -> None: qlib.init(provider_uri=provider_uri, region=REG_US) def test_regiter_custom_ops(self): - # instruments = D.instruments(market='all') + instruments = ["a1x4w7"] - fields = ["PSum($$q_taxrate*$$q_totalcurrentassets, 4)/$close", "$close*$$q_taxrate-$high*$$q_taxrate"] + fields = ["$$q_accountspayable / $$q_totalcurrentassets"] + print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) + fields = [ + "($$q_accountspayable / $$q_totalcurrentassets) / PRef($$q_accountspayable / $$q_totalcurrentassets, 1) - 1" + ] + 
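For readers following these examples: fields prefixed with `$$` are the point-in-time financial-statement features handled by the period operators, plain `$` fields are the usual daily price/volume features, and `PRef`/`PSum` appear to be the period counterparts of `Ref`/`Sum`. The expression just above therefore computes a quarter-over-quarter change of the payables-to-current-assets ratio. A rough pandas analogue on a hypothetical quarterly series (values invented purely for illustration):

```python
import pandas as pd

# hypothetical ratio of accounts payable to total current assets, one value per quarter
ratio = pd.Series([0.10, 0.12, 0.11, 0.15], index=[202001, 202002, 202003, 202004])

qoq_change = ratio / ratio.shift(1) - 1   # roughly what `x / PRef(x, 1) - 1` expresses
trailing_sum = ratio.rolling(4).sum()     # roughly what `PSum(x, 4)` expresses
```
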
print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) + fields = ["PSum($$q_totalrevenue/$$q_totalcurrentassets, 4)"] + print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) + fields = ["$$q_totalcurrentassets/$close/10000"] print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) From b794e657209fd3a786d5b10a57368d0ad46bc12d Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 17 Mar 2021 15:08:59 +0800 Subject: [PATCH 16/30] update test_PIT name --- tests/{test_PIT.py => notest_PIT.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_PIT.py => notest_PIT.py} (100%) diff --git a/tests/test_PIT.py b/tests/notest_PIT.py similarity index 100% rename from tests/test_PIT.py rename to tests/notest_PIT.py From 9df1fbd948c45850ff657c6339efaa95a95d6352 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 8 Apr 2021 00:20:53 +0800 Subject: [PATCH 17/30] add pit collector --- scripts/data_collector/base.py | 3 +- scripts/data_collector/pit/README.md | 23 ++ scripts/data_collector/pit/collector.py | 264 ++++++++++++++++++++ scripts/data_collector/pit/requirements.txt | 9 + 4 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 scripts/data_collector/pit/README.md create mode 100644 scripts/data_collector/pit/collector.py create mode 100644 scripts/data_collector/pit/requirements.txt diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index 12983f6a58..5e1d488d16 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -326,12 +326,13 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, interval= freq, value from [1min, 1d], default 1d """ if source_dir is None: - source_dir = Path(self.default_base_dir).joinpath("_source") + source_dir = Path(self.default_base_dir).joinpath("source") self.source_dir = Path(source_dir).expanduser().resolve() self.source_dir.mkdir(parents=True, exist_ok=True) if normalize_dir is None: normalize_dir = Path(self.default_base_dir).joinpath("normalize") + print(normalize_dir) self.normalize_dir = Path(normalize_dir).expanduser().resolve() self.normalize_dir.mkdir(parents=True, exist_ok=True) diff --git a/scripts/data_collector/pit/README.md b/scripts/data_collector/pit/README.md new file mode 100644 index 0000000000..864479a4df --- /dev/null +++ b/scripts/data_collector/pit/README.md @@ -0,0 +1,23 @@ +# Collect Point-in-Time Data + +> *Please pay **ATTENTION** that the data is collected from [baostock](http://baostock.com) and the data might not be perfect. We recommend users to prepare their own data if they have high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)* + +## Requirements + +```bash +pip install -r requirements.txt +``` + +## Collector Data + + +### Download Quarterly CN Data + +#### 1d from East Money + +```bash + +# download from baostock.com +python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2010-01-01 --end 2021-01-01 --interval quarterly + +``` diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py new file mode 100644 index 0000000000..1cf0b01911 --- /dev/null +++ b/scripts/data_collector/pit/collector.py @@ -0,0 +1,264 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
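The collector that follows pulls four baostock tables per symbol (performance express reports, profit data, profit forecasts and growth data) and reshapes each of them into a common long format that the PIT dump step added further down in this series is meant to consume. A sketch of that target layout, using the column names from the `rename()` calls below (the row itself is invented):

```python
import pandas as pd

# one row per (publication date, reporting period, field)
sample = pd.DataFrame(
    {
        "date": ["2019-10-30"],    # pubDate: when the figure became public
        "period": ["2019-09-30"],  # statDate: the reporting period it refers to
        "value": [0.063],          # e.g. weighted-average ROE stored as a fraction
        "field": ["roeWa"],
    }
)
```
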
+ +import abc +import sys +import datetime +from abc import ABC +from pathlib import Path + +import fire +import numpy as np +import pandas as pd +import baostock as bs +from loguru import logger + +CUR_DIR = Path(__file__).resolve().parent +sys.path.append(str(CUR_DIR.parent.parent)) +from data_collector.base import BaseCollector, BaseRun +from data_collector.utils import get_calendar_list, get_hs_stock_symbols + +class PitCollector(BaseCollector): + + DEFAULT_START_DATETIME_QUARTER = pd.Timestamp("2000-01-01") + DEFAULT_START_DATETIME_ANNUAL = pd.Timestamp("2000-01-01") + DEFAULT_END_DATETIME_QUARTER = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)) + DEFAULT_END_DATETIME_ANNUAL = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)) + + INTERVAL_quarter = "quarterly" + INTERVAL_annual = "annual" + + def __init__( + self, + save_dir: [str, Path], + start=None, + end=None, + interval="quarterly", + max_workers=4, + max_collector_count=2, + delay=0, + check_data_length: bool = False, + limit_nums: int = None, + ): + """ + + Parameters + ---------- + save_dir: str + pit save dir + interval: str: + value from ['quarterly', 'annual'] + max_workers: int + workers, default 4 + max_collector_count: int + default 2 + delay: float + time.sleep(delay), default 0 + start: str + start datetime, default None + end: str + end datetime, default None + limit_nums: int + using for debug, by default None + """ + super(PitCollector, self).__init__( + save_dir=save_dir, + start=start, + end=end, + interval=interval, + max_workers=max_workers, + max_collector_count=max_collector_count, + delay=delay, + limit_nums=limit_nums, + ) + + def normalize_symbol(self, symbol): + symbol_s = symbol.split(".") + symbol = f"sh{symbol_s[0]}" if symbol_s[-1] == "ss" else f"sz{symbol_s[0]}" + return symbol + + def get_instrument_list(self): + logger.info("get cn stock symbols......") + symbols = get_hs_stock_symbols() + logger.info(f"get {len(symbols)} symbols.") + return symbols + + def _get_data_from_baostock(self, symbol, interval, start_datetime, end_datetime): + error_msg = f"{symbol}-{interval}-{start_datetime}-{end_datetime}" + try: + symbol = f"{symbol[7:]}.{symbol[:6]}" + print(symbol) + rs_report = bs.query_performance_express_report(code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date())) + report_list = [] + while (rs_report.error_code == '0') & rs_report.next(): + report_list.append(rs_report.get_row_data()) + # 获取一条记录,将记录合并在一起 + df_report = pd.DataFrame(report_list, columns=rs_report.fields) + if not df_report.empty: + df_report = df_report[['performanceExpPubDate', 'performanceExpStatDate', 'performanceExpressROEWa']] + df_report.rename( + columns = { + "performanceExpPubDate": "date", + "performanceExpStatDate": "period", + "performanceExpressROEWa":"value" + }, + inplace=True, + ) + df_report['value'] = df_report['value'].astype('float32') / 100.0 + df_report['field'] = "roeWa" + + profit_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_profit.error_code == '0') & rs_profit.next(): + row_data = rs_profit.get_row_data() + pub_date = pd.Timestamp(row_data[rs_profit.fields.index('pubDate')]) + if pub_date >= start_datetime and pub_date <= end_datetime: + profit_list.append(row_data) + + df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) + if not df_profit.empty: + df_profit = df_profit[['pubDate', 
'statDate', 'roeAvg']] + df_profit.rename( + columns = { + "pubDate": "date", + "statDate": "period", + "roeAvg":"value" + }, + inplace=True, + ) + df_profit['value'] = df_profit['value'].astype('float32') + df_profit['field'] = "roeWa" + + forecast_list = [] + rs_forecast = bs.query_forecast_report(code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date())) + + while (rs_forecast.error_code == '0') & rs_forecast.next(): + forecast_list.append(rs_forecast.get_row_data()) + + df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) + if not df_forecast.empty: + df_forecast = df_forecast[['profitForcastExpPubDate', 'profitForcastExpStatDate', 'profitForcastChgPctUp', 'profitForcastChgPctDwn']] + df_forecast.rename( + columns = { + "profitForcastExpPubDate": "date", + "profitForcastExpStatDate": "period", + }, + inplace=True, + ) + df_forecast['value'] = (df_forecast['profitForcastChgPctUp'].astype('float32') + df_forecast['profitForcastChgPctDwn'].astype('float32')) / 200 + df_forecast['field'] = "YOYNI" + df_forecast.drop(['profitForcastChgPctUp', 'profitForcastChgPctDwn'], axis=1, inplace=True) + + growth_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_growth.error_code == '0') & rs_growth.next(): + row_data = rs_growth.get_row_data() + pub_date = pd.Timestamp(row_data[rs_growth.fields.index('pubDate')]) + if pub_date >= start_datetime and pub_date <= end_datetime: + growth_list.append(row_data) + df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields)[['pubDate', 'statDate', 'YOYNI']] + if not df_growth.empty: + df_growth = df_growth[['pubDate', 'statDate', 'YOYNI']] + df_growth.rename( + columns = { + "pubDate": "date", + "statDate": "period", + "YOYNI":"value" + }, + inplace=True, + ) + df_growth['value'] = df_growth['value'].astype('float32') + df_growth['field'] = "YOYNI" + + df_merge = df_report.append([df_profit, df_forecast, df_growth]) + return df_merge + except Exception as e: + logger.warning(f"{error_msg}:{e}") + + def get_data( + self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp + ) -> [pd.DataFrame]: + + if interval == self.INTERVAL_quarter: + _result = self._get_data_from_baostock(symbol, interval, start_datetime, end_datetime) + else: + raise ValueError(f"cannot support {interval}") + return _result + + @property + def min_numbers_trading(self): + pass + +class Run(BaseRun): + def __init__(self, source_dir=None, max_workers=4, interval="quarterly"): + """ + + Parameters + ---------- + source_dir: str + The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source" + normalize_dir: str + Directory for normalize data, default "Path(__file__).parent/normalize" + max_workers: int + Concurrent number, default is 4 + interval: str + freq, value from [quarterly, annual], default 1d + """ + super().__init__(source_dir=source_dir, max_workers=max_workers, interval=interval) + + @property + def collector_class_name(self): + return "PitCollector" + + @property + def default_base_dir(self) -> [Path, str]: + return CUR_DIR + + def download_data( + self, + max_collector_count=2, + delay=0, + start=None, + end=None, + interval="quarterly", + check_data_length=False, + limit_nums=None, + ): + """download data from Internet + + Parameters + ---------- + max_collector_count: int + default 2 + delay: float + 
time.sleep(delay), default 0 + interval: str + freq, value from [quarterly, annual], default 1d + start: str + start datetime, default "2000-01-01" + end: str + end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`` + check_data_length: bool # if this param useful? + check data length, by default False + limit_nums: int + using for debug, by default None + + Examples + --------- + # get quarterly data + $ python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2000-01-01 --end 2021-01-01 --interval quarterly + """ + + super(Run, self).download_data(max_collector_count, delay, start, end, interval, check_data_length, limit_nums) + + def normalize_class_name(self): + pass + +if __name__ == "__main__": + bs.login() + fire.Fire(Run) + bs.logout() diff --git a/scripts/data_collector/pit/requirements.txt b/scripts/data_collector/pit/requirements.txt new file mode 100644 index 0000000000..0cd9b42f9c --- /dev/null +++ b/scripts/data_collector/pit/requirements.txt @@ -0,0 +1,9 @@ +loguru +fire +tqdm +requests +pandas +lxml +loguru +baostock +yahooquery \ No newline at end of file From 71d5640036bce2cad685a0b69a4ef50e7595b53c Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 8 Apr 2021 00:23:48 +0800 Subject: [PATCH 18/30] black format --- scripts/data_collector/pit/collector.py | 95 ++++++++++++++----------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index 1cf0b01911..cbe68287f8 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -18,6 +18,7 @@ from data_collector.base import BaseCollector, BaseRun from data_collector.utils import get_calendar_list, get_hs_stock_symbols + class PitCollector(BaseCollector): DEFAULT_START_DATETIME_QUARTER = pd.Timestamp("2000-01-01") @@ -76,103 +77,109 @@ def normalize_symbol(self, symbol): symbol_s = symbol.split(".") symbol = f"sh{symbol_s[0]}" if symbol_s[-1] == "ss" else f"sz{symbol_s[0]}" return symbol - + def get_instrument_list(self): logger.info("get cn stock symbols......") symbols = get_hs_stock_symbols() logger.info(f"get {len(symbols)} symbols.") return symbols - + def _get_data_from_baostock(self, symbol, interval, start_datetime, end_datetime): error_msg = f"{symbol}-{interval}-{start_datetime}-{end_datetime}" try: symbol = f"{symbol[7:]}.{symbol[:6]}" print(symbol) - rs_report = bs.query_performance_express_report(code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date())) + rs_report = bs.query_performance_express_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) report_list = [] - while (rs_report.error_code == '0') & rs_report.next(): + while (rs_report.error_code == "0") & rs_report.next(): report_list.append(rs_report.get_row_data()) # 获取一条记录,将记录合并在一起 df_report = pd.DataFrame(report_list, columns=rs_report.fields) if not df_report.empty: - df_report = df_report[['performanceExpPubDate', 'performanceExpStatDate', 'performanceExpressROEWa']] + df_report = df_report[["performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"]] df_report.rename( - columns = { - "performanceExpPubDate": "date", + columns={ + "performanceExpPubDate": "date", "performanceExpStatDate": "period", - "performanceExpressROEWa":"value" - }, + "performanceExpressROEWa": "value", + }, inplace=True, ) - df_report['value'] = df_report['value'].astype('float32') / 100.0 - 
df_report['field'] = "roeWa" + df_report["value"] = df_report["value"].astype("float32") / 100.0 + df_report["field"] = "roeWa" profit_list = [] for year in range(start_datetime.year - 1, end_datetime.year + 1): for q_num in range(0, 4): rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_profit.error_code == '0') & rs_profit.next(): + while (rs_profit.error_code == "0") & rs_profit.next(): row_data = rs_profit.get_row_data() - pub_date = pd.Timestamp(row_data[rs_profit.fields.index('pubDate')]) + pub_date = pd.Timestamp(row_data[rs_profit.fields.index("pubDate")]) if pub_date >= start_datetime and pub_date <= end_datetime: profit_list.append(row_data) - + df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) if not df_profit.empty: - df_profit = df_profit[['pubDate', 'statDate', 'roeAvg']] + df_profit = df_profit[["pubDate", "statDate", "roeAvg"]] df_profit.rename( - columns = { - "pubDate": "date", - "statDate": "period", - "roeAvg":"value" - }, + columns={"pubDate": "date", "statDate": "period", "roeAvg": "value"}, inplace=True, ) - df_profit['value'] = df_profit['value'].astype('float32') - df_profit['field'] = "roeWa" + df_profit["value"] = df_profit["value"].astype("float32") + df_profit["field"] = "roeWa" forecast_list = [] - rs_forecast = bs.query_forecast_report(code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date())) - - while (rs_forecast.error_code == '0') & rs_forecast.next(): + rs_forecast = bs.query_forecast_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) + + while (rs_forecast.error_code == "0") & rs_forecast.next(): forecast_list.append(rs_forecast.get_row_data()) df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) if not df_forecast.empty: - df_forecast = df_forecast[['profitForcastExpPubDate', 'profitForcastExpStatDate', 'profitForcastChgPctUp', 'profitForcastChgPctDwn']] + df_forecast = df_forecast[ + [ + "profitForcastExpPubDate", + "profitForcastExpStatDate", + "profitForcastChgPctUp", + "profitForcastChgPctDwn", + ] + ] df_forecast.rename( - columns = { - "profitForcastExpPubDate": "date", + columns={ + "profitForcastExpPubDate": "date", "profitForcastExpStatDate": "period", - }, + }, inplace=True, ) - df_forecast['value'] = (df_forecast['profitForcastChgPctUp'].astype('float32') + df_forecast['profitForcastChgPctDwn'].astype('float32')) / 200 - df_forecast['field'] = "YOYNI" - df_forecast.drop(['profitForcastChgPctUp', 'profitForcastChgPctDwn'], axis=1, inplace=True) + df_forecast["value"] = ( + df_forecast["profitForcastChgPctUp"].astype("float32") + + df_forecast["profitForcastChgPctDwn"].astype("float32") + ) / 200 + df_forecast["field"] = "YOYNI" + df_forecast.drop(["profitForcastChgPctUp", "profitForcastChgPctDwn"], axis=1, inplace=True) growth_list = [] for year in range(start_datetime.year - 1, end_datetime.year + 1): for q_num in range(0, 4): rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_growth.error_code == '0') & rs_growth.next(): + while (rs_growth.error_code == "0") & rs_growth.next(): row_data = rs_growth.get_row_data() - pub_date = pd.Timestamp(row_data[rs_growth.fields.index('pubDate')]) + pub_date = pd.Timestamp(row_data[rs_growth.fields.index("pubDate")]) if pub_date >= start_datetime and pub_date <= end_datetime: growth_list.append(row_data) - df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields)[['pubDate', 'statDate', 'YOYNI']] + df_growth = 
pd.DataFrame(growth_list, columns=rs_growth.fields)[["pubDate", "statDate", "YOYNI"]] if not df_growth.empty: - df_growth = df_growth[['pubDate', 'statDate', 'YOYNI']] + df_growth = df_growth[["pubDate", "statDate", "YOYNI"]] df_growth.rename( - columns = { - "pubDate": "date", - "statDate": "period", - "YOYNI":"value" - }, + columns={"pubDate": "date", "statDate": "period", "YOYNI": "value"}, inplace=True, ) - df_growth['value'] = df_growth['value'].astype('float32') - df_growth['field'] = "YOYNI" + df_growth["value"] = df_growth["value"].astype("float32") + df_growth["field"] = "YOYNI" df_merge = df_report.append([df_profit, df_forecast, df_growth]) return df_merge @@ -193,6 +200,7 @@ def get_data( def min_numbers_trading(self): pass + class Run(BaseRun): def __init__(self, source_dir=None, max_workers=4, interval="quarterly"): """ @@ -258,6 +266,7 @@ def download_data( def normalize_class_name(self): pass + if __name__ == "__main__": bs.login() fire.Fire(Run) From ebe277bcbf50ec44ee1b1beace5f3010b009db2d Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 8 Apr 2021 15:51:16 +0800 Subject: [PATCH 19/30] fix bugs --- scripts/data_collector/base.py | 2 - scripts/data_collector/pit/README.md | 2 +- scripts/data_collector/pit/collector.py | 216 +++++++++++++----------- 3 files changed, 123 insertions(+), 97 deletions(-) diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index 5e1d488d16..f33f1d95f2 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -200,7 +200,6 @@ def _collector(self, instrument_list): if _result != self.NORMAL_FLAG: error_symbol.append(_symbol) p_bar.update() - print(error_symbol) logger.info(f"error symbol nums: {len(error_symbol)}") logger.info(f"current get symbol nums: {len(instrument_list)}") error_symbol.extend(self.mini_symbol_map.keys()) @@ -332,7 +331,6 @@ def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, interval= if normalize_dir is None: normalize_dir = Path(self.default_base_dir).joinpath("normalize") - print(normalize_dir) self.normalize_dir = Path(normalize_dir).expanduser().resolve() self.normalize_dir.mkdir(parents=True, exist_ok=True) diff --git a/scripts/data_collector/pit/README.md b/scripts/data_collector/pit/README.md index 864479a4df..6936c983ce 100644 --- a/scripts/data_collector/pit/README.md +++ b/scripts/data_collector/pit/README.md @@ -18,6 +18,6 @@ pip install -r requirements.txt ```bash # download from baostock.com -python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2010-01-01 --end 2021-01-01 --interval quarterly +python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2010-01-01 --end 2020-01-01 --interval quarterly ``` diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index cbe68287f8..6b66711d57 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -35,8 +35,8 @@ def __init__( start=None, end=None, interval="quarterly", - max_workers=4, - max_collector_count=2, + max_workers=1, + max_collector_count=1, delay=0, check_data_length: bool = False, limit_nums: int = None, @@ -50,9 +50,9 @@ def __init__( interval: str: value from ['quarterly', 'annual'] max_workers: int - workers, default 4 + workers, default 1 max_collector_count: int - default 2 + default 1 delay: float time.sleep(delay), default 0 start: str @@ -86,105 +86,132 @@ def get_instrument_list(self): def _get_data_from_baostock(self, symbol, 
interval, start_datetime, end_datetime): error_msg = f"{symbol}-{interval}-{start_datetime}-{end_datetime}" - try: - symbol = f"{symbol[7:]}.{symbol[:6]}" - print(symbol) - rs_report = bs.query_performance_express_report( - code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + + def _str_to_float(r): + try: + return float(r) + except Exception as e: + return np.nan + + print(symbol) + symbol = f"{symbol[7:]}.{symbol[:6]}" + rs_report = bs.query_performance_express_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) + report_list = [] + while (rs_report.error_code == "0") & rs_report.next(): + report_list.append(rs_report.get_row_data()) + # 获取一条记录,将记录合并在一起 + + df_report = pd.DataFrame(report_list, columns=rs_report.fields) + if {"performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"} <= set(rs_report.fields): + df_report = df_report[["performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"]] + df_report.rename( + columns={ + "performanceExpPubDate": "date", + "performanceExpStatDate": "period", + "performanceExpressROEWa": "value", + }, + inplace=True, ) - report_list = [] - while (rs_report.error_code == "0") & rs_report.next(): - report_list.append(rs_report.get_row_data()) - # 获取一条记录,将记录合并在一起 - df_report = pd.DataFrame(report_list, columns=rs_report.fields) - if not df_report.empty: - df_report = df_report[["performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"]] - df_report.rename( - columns={ - "performanceExpPubDate": "date", - "performanceExpStatDate": "period", - "performanceExpressROEWa": "value", - }, - inplace=True, - ) - df_report["value"] = df_report["value"].astype("float32") / 100.0 - df_report["field"] = "roeWa" - - profit_list = [] - for year in range(start_datetime.year - 1, end_datetime.year + 1): - for q_num in range(0, 4): - rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_profit.error_code == "0") & rs_profit.next(): - row_data = rs_profit.get_row_data() + df_report["value"] = df_report["value"].apply(lambda r: _str_to_float(r) / 100.0) + df_report["field"] = "roeWa" + + profit_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_profit.error_code == "0") & rs_profit.next(): + row_data = rs_profit.get_row_data() + if "pubDate" in rs_profit.fields: pub_date = pd.Timestamp(row_data[rs_profit.fields.index("pubDate")]) if pub_date >= start_datetime and pub_date <= end_datetime: profit_list.append(row_data) - df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) - if not df_profit.empty: - df_profit = df_profit[["pubDate", "statDate", "roeAvg"]] - df_profit.rename( - columns={"pubDate": "date", "statDate": "period", "roeAvg": "value"}, - inplace=True, - ) - df_profit["value"] = df_profit["value"].astype("float32") - df_profit["field"] = "roeWa" - - forecast_list = [] - rs_forecast = bs.query_forecast_report( - code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) + if {"pubDate", "statDate", "roeAvg"} <= set(rs_profit.fields): + df_profit = df_profit[["pubDate", "statDate", "roeAvg"]] + df_profit.rename( + columns={"pubDate": "date", "statDate": "period", "roeAvg": "value"}, + inplace=True, ) + df_profit["value"] = 
df_profit["value"].apply(_str_to_float) + df_profit["field"] = "roeWa" + + forecast_list = [] + rs_forecast = bs.query_forecast_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) - while (rs_forecast.error_code == "0") & rs_forecast.next(): - forecast_list.append(rs_forecast.get_row_data()) - - df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) - if not df_forecast.empty: - df_forecast = df_forecast[ - [ - "profitForcastExpPubDate", - "profitForcastExpStatDate", - "profitForcastChgPctUp", - "profitForcastChgPctDwn", - ] + while (rs_forecast.error_code == "0") & rs_forecast.next(): + forecast_list.append(rs_forecast.get_row_data()) + + df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) + if { + "profitForcastExpPubDate", + "profitForcastExpStatDate", + "profitForcastChgPctUp", + "profitForcastChgPctDwn", + } <= set(rs_forecast.fields): + df_forecast = df_forecast[ + [ + "profitForcastExpPubDate", + "profitForcastExpStatDate", + "profitForcastChgPctUp", + "profitForcastChgPctDwn", ] - df_forecast.rename( - columns={ - "profitForcastExpPubDate": "date", - "profitForcastExpStatDate": "period", - }, - inplace=True, - ) - df_forecast["value"] = ( - df_forecast["profitForcastChgPctUp"].astype("float32") - + df_forecast["profitForcastChgPctDwn"].astype("float32") - ) / 200 - df_forecast["field"] = "YOYNI" - df_forecast.drop(["profitForcastChgPctUp", "profitForcastChgPctDwn"], axis=1, inplace=True) - - growth_list = [] - for year in range(start_datetime.year - 1, end_datetime.year + 1): - for q_num in range(0, 4): - rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_growth.error_code == "0") & rs_growth.next(): - row_data = rs_growth.get_row_data() + ] + df_forecast.rename( + columns={ + "profitForcastExpPubDate": "date", + "profitForcastExpStatDate": "period", + }, + inplace=True, + ) + + df_forecast["profitForcastChgPctUp"] = df_forecast["profitForcastChgPctUp"].apply(_str_to_float) + df_forecast["profitForcastChgPctDwn"] = df_forecast["profitForcastChgPctDwn"].apply(_str_to_float) + df_forecast["value"] = (df_forecast["profitForcastChgPctUp"] + df_forecast["profitForcastChgPctDwn"]) / 200 + df_forecast["field"] = "YOYNI" + df_forecast.drop(["profitForcastChgPctUp", "profitForcastChgPctDwn"], axis=1, inplace=True) + + growth_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_growth.error_code == "0") & rs_growth.next(): + row_data = rs_growth.get_row_data() + if "pubDate" in rs_growth.fields: pub_date = pd.Timestamp(row_data[rs_growth.fields.index("pubDate")]) if pub_date >= start_datetime and pub_date <= end_datetime: growth_list.append(row_data) - df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields)[["pubDate", "statDate", "YOYNI"]] - if not df_growth.empty: - df_growth = df_growth[["pubDate", "statDate", "YOYNI"]] - df_growth.rename( - columns={"pubDate": "date", "statDate": "period", "YOYNI": "value"}, - inplace=True, - ) - df_growth["value"] = df_growth["value"].astype("float32") - df_growth["field"] = "YOYNI" - - df_merge = df_report.append([df_profit, df_forecast, df_growth]) - return df_merge - except Exception as e: - logger.warning(f"{error_msg}:{e}") + + df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields) + if {"pubDate", "statDate", "YOYNI"} <= set(rs_growth.fields): + df_growth = 
df_growth[["pubDate", "statDate", "YOYNI"]] + df_growth.rename( + columns={"pubDate": "date", "statDate": "period", "YOYNI": "value"}, + inplace=True, + ) + df_growth["value"] = df_growth["value"].apply(_str_to_float) + df_growth["field"] = "YOYNI" + df_merge = df_report.append([df_profit, df_forecast, df_growth]) + + def _process_period(r): + _date = pd.Timestamp(r) + if _date.month == 3 and _date.day == 31: + return "q1" + elif _date.month == 6 and _date.day == 30: + return "q2" + elif _date.month == 9 and _date.day == 30: + return "q3" + elif _date.month == 12 and _date.day == 31: + return "q4" + else: + return "unknown" + + df_merge["period"].apply(_process_period) + return df_merge def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp @@ -192,6 +219,7 @@ def get_data( if interval == self.INTERVAL_quarter: _result = self._get_data_from_baostock(symbol, interval, start_datetime, end_datetime) + else: raise ValueError(f"cannot support {interval}") return _result @@ -202,7 +230,7 @@ def min_numbers_trading(self): class Run(BaseRun): - def __init__(self, source_dir=None, max_workers=4, interval="quarterly"): + def __init__(self, source_dir=None, max_workers=1, interval="quarterly"): """ Parameters @@ -228,7 +256,7 @@ def default_base_dir(self) -> [Path, str]: def download_data( self, - max_collector_count=2, + max_collector_count=1, delay=0, start=None, end=None, From 655ff518be368bb65ab1706ba65045bbab7dd0c4 Mon Sep 17 00:00:00 2001 From: bxdd Date: Thu, 8 Apr 2021 15:54:34 +0800 Subject: [PATCH 20/30] fix try --- scripts/data_collector/pit/collector.py | 216 ++++++++++++------------ 1 file changed, 110 insertions(+), 106 deletions(-) diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index 6b66711d57..91daece558 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -93,110 +93,6 @@ def _str_to_float(r): except Exception as e: return np.nan - print(symbol) - symbol = f"{symbol[7:]}.{symbol[:6]}" - rs_report = bs.query_performance_express_report( - code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) - ) - report_list = [] - while (rs_report.error_code == "0") & rs_report.next(): - report_list.append(rs_report.get_row_data()) - # 获取一条记录,将记录合并在一起 - - df_report = pd.DataFrame(report_list, columns=rs_report.fields) - if {"performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"} <= set(rs_report.fields): - df_report = df_report[["performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"]] - df_report.rename( - columns={ - "performanceExpPubDate": "date", - "performanceExpStatDate": "period", - "performanceExpressROEWa": "value", - }, - inplace=True, - ) - df_report["value"] = df_report["value"].apply(lambda r: _str_to_float(r) / 100.0) - df_report["field"] = "roeWa" - - profit_list = [] - for year in range(start_datetime.year - 1, end_datetime.year + 1): - for q_num in range(0, 4): - rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_profit.error_code == "0") & rs_profit.next(): - row_data = rs_profit.get_row_data() - if "pubDate" in rs_profit.fields: - pub_date = pd.Timestamp(row_data[rs_profit.fields.index("pubDate")]) - if pub_date >= start_datetime and pub_date <= end_datetime: - profit_list.append(row_data) - - df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) - if {"pubDate", "statDate", "roeAvg"} <= set(rs_profit.fields): - 
df_profit = df_profit[["pubDate", "statDate", "roeAvg"]] - df_profit.rename( - columns={"pubDate": "date", "statDate": "period", "roeAvg": "value"}, - inplace=True, - ) - df_profit["value"] = df_profit["value"].apply(_str_to_float) - df_profit["field"] = "roeWa" - - forecast_list = [] - rs_forecast = bs.query_forecast_report( - code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) - ) - - while (rs_forecast.error_code == "0") & rs_forecast.next(): - forecast_list.append(rs_forecast.get_row_data()) - - df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) - if { - "profitForcastExpPubDate", - "profitForcastExpStatDate", - "profitForcastChgPctUp", - "profitForcastChgPctDwn", - } <= set(rs_forecast.fields): - df_forecast = df_forecast[ - [ - "profitForcastExpPubDate", - "profitForcastExpStatDate", - "profitForcastChgPctUp", - "profitForcastChgPctDwn", - ] - ] - df_forecast.rename( - columns={ - "profitForcastExpPubDate": "date", - "profitForcastExpStatDate": "period", - }, - inplace=True, - ) - - df_forecast["profitForcastChgPctUp"] = df_forecast["profitForcastChgPctUp"].apply(_str_to_float) - df_forecast["profitForcastChgPctDwn"] = df_forecast["profitForcastChgPctDwn"].apply(_str_to_float) - df_forecast["value"] = (df_forecast["profitForcastChgPctUp"] + df_forecast["profitForcastChgPctDwn"]) / 200 - df_forecast["field"] = "YOYNI" - df_forecast.drop(["profitForcastChgPctUp", "profitForcastChgPctDwn"], axis=1, inplace=True) - - growth_list = [] - for year in range(start_datetime.year - 1, end_datetime.year + 1): - for q_num in range(0, 4): - rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) - while (rs_growth.error_code == "0") & rs_growth.next(): - row_data = rs_growth.get_row_data() - if "pubDate" in rs_growth.fields: - pub_date = pd.Timestamp(row_data[rs_growth.fields.index("pubDate")]) - if pub_date >= start_datetime and pub_date <= end_datetime: - growth_list.append(row_data) - - df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields) - if {"pubDate", "statDate", "YOYNI"} <= set(rs_growth.fields): - df_growth = df_growth[["pubDate", "statDate", "YOYNI"]] - df_growth.rename( - columns={"pubDate": "date", "statDate": "period", "YOYNI": "value"}, - inplace=True, - ) - df_growth["value"] = df_growth["value"].apply(_str_to_float) - df_growth["field"] = "YOYNI" - df_merge = df_report.append([df_profit, df_forecast, df_growth]) - def _process_period(r): _date = pd.Timestamp(r) if _date.month == 3 and _date.day == 31: @@ -210,8 +106,116 @@ def _process_period(r): else: return "unknown" - df_merge["period"].apply(_process_period) - return df_merge + try: + symbol = f"{symbol[7:]}.{symbol[:6]}" + rs_report = bs.query_performance_express_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) + report_list = [] + while (rs_report.error_code == "0") & rs_report.next(): + report_list.append(rs_report.get_row_data()) + # 获取一条记录,将记录合并在一起 + + df_report = pd.DataFrame(report_list, columns=rs_report.fields) + if {"performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"} <= set(rs_report.fields): + df_report = df_report[["performanceExpPubDate", "performanceExpStatDate", "performanceExpressROEWa"]] + df_report.rename( + columns={ + "performanceExpPubDate": "date", + "performanceExpStatDate": "period", + "performanceExpressROEWa": "value", + }, + inplace=True, + ) + df_report["value"] = df_report["value"].apply(lambda r: _str_to_float(r) / 100.0) + 
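A note on the conversion just above: `performanceExpressROEWa` is divided by 100, which suggests baostock returns that field in percent, while `roeAvg` from the profit table is kept as-is; `_str_to_float` simply turns unparsable strings into NaN so a single bad cell does not abort the whole symbol. A minimal standalone equivalent of that helper, for reference only:

```python
import numpy as np

def str_to_float(raw):
    """Hypothetical standalone version of the `_str_to_float` helper used here."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return np.nan
```
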
df_report["field"] = "roeWa" + + profit_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_profit = bs.query_profit_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_profit.error_code == "0") & rs_profit.next(): + row_data = rs_profit.get_row_data() + if "pubDate" in rs_profit.fields: + pub_date = pd.Timestamp(row_data[rs_profit.fields.index("pubDate")]) + if pub_date >= start_datetime and pub_date <= end_datetime: + profit_list.append(row_data) + + df_profit = pd.DataFrame(profit_list, columns=rs_profit.fields) + if {"pubDate", "statDate", "roeAvg"} <= set(rs_profit.fields): + df_profit = df_profit[["pubDate", "statDate", "roeAvg"]] + df_profit.rename( + columns={"pubDate": "date", "statDate": "period", "roeAvg": "value"}, + inplace=True, + ) + df_profit["value"] = df_profit["value"].apply(_str_to_float) + df_profit["field"] = "roeWa" + + forecast_list = [] + rs_forecast = bs.query_forecast_report( + code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) + ) + + while (rs_forecast.error_code == "0") & rs_forecast.next(): + forecast_list.append(rs_forecast.get_row_data()) + + df_forecast = pd.DataFrame(forecast_list, columns=rs_forecast.fields) + if { + "profitForcastExpPubDate", + "profitForcastExpStatDate", + "profitForcastChgPctUp", + "profitForcastChgPctDwn", + } <= set(rs_forecast.fields): + df_forecast = df_forecast[ + [ + "profitForcastExpPubDate", + "profitForcastExpStatDate", + "profitForcastChgPctUp", + "profitForcastChgPctDwn", + ] + ] + df_forecast.rename( + columns={ + "profitForcastExpPubDate": "date", + "profitForcastExpStatDate": "period", + }, + inplace=True, + ) + + df_forecast["profitForcastChgPctUp"] = df_forecast["profitForcastChgPctUp"].apply(_str_to_float) + df_forecast["profitForcastChgPctDwn"] = df_forecast["profitForcastChgPctDwn"].apply(_str_to_float) + df_forecast["value"] = ( + df_forecast["profitForcastChgPctUp"] + df_forecast["profitForcastChgPctDwn"] + ) / 200 + df_forecast["field"] = "YOYNI" + df_forecast.drop(["profitForcastChgPctUp", "profitForcastChgPctDwn"], axis=1, inplace=True) + + growth_list = [] + for year in range(start_datetime.year - 1, end_datetime.year + 1): + for q_num in range(0, 4): + rs_growth = bs.query_growth_data(code=symbol, year=year, quarter=q_num + 1) + while (rs_growth.error_code == "0") & rs_growth.next(): + row_data = rs_growth.get_row_data() + if "pubDate" in rs_growth.fields: + pub_date = pd.Timestamp(row_data[rs_growth.fields.index("pubDate")]) + if pub_date >= start_datetime and pub_date <= end_datetime: + growth_list.append(row_data) + + df_growth = pd.DataFrame(growth_list, columns=rs_growth.fields) + if {"pubDate", "statDate", "YOYNI"} <= set(rs_growth.fields): + df_growth = df_growth[["pubDate", "statDate", "YOYNI"]] + df_growth.rename( + columns={"pubDate": "date", "statDate": "period", "YOYNI": "value"}, + inplace=True, + ) + df_growth["value"] = df_growth["value"].apply(_str_to_float) + df_growth["field"] = "YOYNI" + df_merge = df_report.append([df_profit, df_forecast, df_growth]) + + df_merge["period"].apply(_process_period) + return df_merge + except Exception as e: + logger.warning(f"{error_msg}:{e}") def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp From f6ca4d2cc12f5ee3d03857a8e56c28192b740256 Mon Sep 17 00:00:00 2001 From: bxdd Date: Fri, 9 Apr 2021 23:58:19 +0800 Subject: [PATCH 21/30] fix bug & add dump_pit.py --- qlib/data/data.py | 35 ++- 
qlib/utils/__init__.py | 33 ++- scripts/data_collector/pit/README.md | 9 +- scripts/data_collector/pit/collector.py | 45 ++-- scripts/dump_pit.py | 277 ++++++++++++++++++++++++ tests/notest_PIT.py | 23 +- 6 files changed, 363 insertions(+), 59 deletions(-) create mode 100644 scripts/dump_pit.py diff --git a/qlib/data/data.py b/qlib/data/data.py index 94f85d067a..67eaacb4e6 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -20,7 +20,15 @@ from .cache import H from ..config import C from ..log import get_module_logger -from ..utils import parse_field, read_bin, read_period_data, hash_args, normalize_cache_fields, code_to_fname +from ..utils import ( + parse_field, + read_bin, + read_period_data, + get_period_list, + hash_args, + normalize_cache_fields, + code_to_fname, +) from .base import Feature, PFeature, Operators from .cache import DiskDatasetCache, DiskExpressionCache from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path @@ -661,7 +669,7 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, ("value", C.pit_record_type["value"]), ("_next", C.pit_record_type["index"]), ] - VALUE_TYPE = C.pit_record_type["value"] + VALUE_DTYPE = C.pit_record_type["value"] field = str(field).lower()[2:] instrument = code_to_fname(instrument) @@ -676,9 +684,9 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, if field not in self.period_index: self.period_index[field] = {} - if not field.startswith("q_") and not field.startswith("a_"): - raise ValueError("period field must start with 'q_' or 'a_'") - quarterly = field.startswith("q_") + if not field.endswith("_q") and not field.endswith("_a"): + raise ValueError("period field must ends with '_q' or '_q'") + quarterly = field.endswith("_q") index_path = self.uri_period_index.format(instrument.lower(), field) data_path = self.uri_period_data.format(instrument.lower(), field) data = np.fromfile(data_path, dtype=DATA_RECORDS) @@ -691,26 +699,15 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, last_period = data["period"][loc - start_offset - 1 : loc - end_offset].max() # return the latest quarter first_period = data["period"][loc - start_offset - 1 : loc - end_offset].min() - if not quarterly: - assert all(1900 <= x <= 2099 for x in (first_period, last_period)), "invalid arguments" - period_list = list(range(first_period, last_period + 1)) - else: - assert all(190000 <= x <= 209904 for x in (first_period, last_period)), "invalid arguments" - period_list = [] - for year in range(first_period // 100, last_period // 100 + 1): - for q in range(1, 5): - period = year * 100 + q - if first_period <= period <= last_period: - period_list.append(year * 100 + q) - - value = np.empty(len(period_list), dtype=VALUE_TYPE) + period_list = get_period_list(first_period, last_period, quarterly) + value = np.empty(len(period_list), dtype=VALUE_DTYPE) for i, period in enumerate(period_list): last_period_index = self.period_index[field].get(period) value[i], now_period_index = read_period_data( index_path, data_path, period, cur_date, quarterly, last_period_index ) self.period_index[field].update({period: now_period_index}) - series = pd.Series(value, index=period_list, dtype=VALUE_TYPE) + series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) if cur_index == end_index: self.all_fields.remove(field) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 3ed3c0537a..f82809ed54 100644 --- a/qlib/utils/__init__.py +++ 
b/qlib/utils/__init__.py @@ -56,6 +56,29 @@ def read_bin(file_path, start_index, end_index): return series +def get_period_list(first, last, quarterly): + if not quarterly: + assert all(1900 <= x <= 2099 for x in (first, last)), "invalid arguments" + return list(range(first, last + 1)) + else: + assert all(190000 <= x <= 209904 for x in (first, last)), "invalid arguments" + res = [] + for year in range(first // 100, last // 100 + 1): + for q in range(1, 5): + period = year * 100 + q + if first <= period <= last: + res.append(year * 100 + q) + return res + + +def get_period_offset(first_year, period, quarterly): + if quarterly: + offset = (period // 100 - first_year) * 4 + period % 100 - 1 + else: + offset = period - first_year + return offset + + def read_period_data(index_path, data_path, period, cur_date, quarterly, last_period_index): DATA_DTYPE = "".join( @@ -67,8 +90,8 @@ def read_period_data(index_path, data_path, period, cur_date, quarterly, last_pe ] ) - PERIOD_TYPE = C.pit_record_type["period"] - INDEX_TYPE = C.pit_record_type["index"] + PERIOD_DTYPE = C.pit_record_type["period"] + INDEX_DTYPE = C.pit_record_type["index"] NAN_VALUE = C.pit_record_nan["value"] NAN_INDEX = C.pit_record_nan["index"] @@ -76,9 +99,9 @@ def read_period_data(index_path, data_path, period, cur_date, quarterly, last_pe # find the first index of linked revisions if last_period_index is None: with open(index_path, "rb") as fi: - (first_year,) = struct.unpack(PERIOD_TYPE, fi.read(struct.calcsize(PERIOD_TYPE))) - all_periods = np.fromfile(fi, dtype=INDEX_TYPE) - offset = (period // 100 - first_year) * 4 + period % 100 - 1 if quarterly else period - first_year + (first_year,) = struct.unpack(PERIOD_DTYPE, fi.read(struct.calcsize(PERIOD_DTYPE))) + all_periods = np.fromfile(fi, dtype=INDEX_DTYPE) + offset = get_period_offset(first_year, period, quarterly) _next = all_periods[offset] else: _next = last_period_index diff --git a/scripts/data_collector/pit/README.md b/scripts/data_collector/pit/README.md index 6936c983ce..613e5cd704 100644 --- a/scripts/data_collector/pit/README.md +++ b/scripts/data_collector/pit/README.md @@ -13,11 +13,14 @@ pip install -r requirements.txt ### Download Quarterly CN Data -#### 1d from East Money - ```bash # download from baostock.com -python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2010-01-01 --end 2020-01-01 --interval quarterly +python collector.py download_data --source_dir /data1/v-xiabi/qlib/pit/csv_2 --start 2000-01-01 --end 2020-01-01 --interval quarterly ``` + +### Dump Data into PIT Format + +cd qlib/scripts +python dump_pit.py dump --csv_path /data1/v-xiabi/qlib/pit/csv_2 --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly \ No newline at end of file diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index 91daece558..6ff7280b42 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -26,7 +26,7 @@ class PitCollector(BaseCollector): DEFAULT_END_DATETIME_QUARTER = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)) DEFAULT_END_DATETIME_ANNUAL = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)) - INTERVAL_quarter = "quarterly" + INTERVAL_quarterly = "quarterly" INTERVAL_annual = "annual" def __init__( @@ -70,6 +70,7 @@ def __init__( max_workers=max_workers, max_collector_count=max_collector_count, delay=delay, + check_data_length=check_data_length, limit_nums=limit_nums, ) @@ -93,19 +94,6 @@ def _str_to_float(r): 
except Exception as e: return np.nan - def _process_period(r): - _date = pd.Timestamp(r) - if _date.month == 3 and _date.day == 31: - return "q1" - elif _date.month == 6 and _date.day == 30: - return "q2" - elif _date.month == 9 and _date.day == 30: - return "q3" - elif _date.month == 12 and _date.day == 31: - return "q4" - else: - return "unknown" - try: symbol = f"{symbol[7:]}.{symbol[:6]}" rs_report = bs.query_performance_express_report( @@ -212,21 +200,42 @@ def _process_period(r): df_growth["field"] = "YOYNI" df_merge = df_report.append([df_profit, df_forecast, df_growth]) - df_merge["period"].apply(_process_period) return df_merge except Exception as e: logger.warning(f"{error_msg}:{e}") + def _process_data(self, df, symbol, interval): + error_msg = f"{symbol}-{interval}" + + def _process_period(r): + _date = pd.Timestamp(r) + return _date.year if interval == self.INTERVAL_annual else _date.year * 100 + (_date.month - 1) // 3 + 1 + + try: + _date = df["period"].apply( + lambda x: ( + pd.to_datetime(x) + pd.DateOffset(days=(45 if interval == self.INTERVAL_quarterly else 90)) + ).date() + ) + df["date"] = df["date"].fillna(_date.astype(str)) + df["period"] = df["period"].apply(_process_period) + return df + except Exception as e: + logger.warning(f"{error_msg}:{e}") + def get_data( self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp ) -> [pd.DataFrame]: - if interval == self.INTERVAL_quarter: + if interval == self.INTERVAL_quarterly: _result = self._get_data_from_baostock(symbol, interval, start_datetime, end_datetime) - + if _result is None or _result.empty: + return _result + else: + return self._process_data(_result, symbol, interval) else: raise ValueError(f"cannot support {interval}") - return _result + return self._process_data(_result, interval) @property def min_numbers_trading(self): diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py new file mode 100644 index 0000000000..9bff2f0f2f --- /dev/null +++ b/scripts/dump_pit.py @@ -0,0 +1,277 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
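The dump script that starts here relies on the period encoding added to `qlib/utils/__init__.py` above: annual periods are plain years (e.g. `2019`), quarterly periods are `yyyyqq` integers (e.g. `201904`), and an index file only stores the first year because every later period maps to a fixed slot. Below is a minimal sketch mirroring `get_period_list` / `get_period_offset` from that hunk, not a new API:

```python
from typing import List

def period_list(first: int, last: int, quarterly: bool) -> List[int]:
    """All periods between `first` and `last`, inclusive."""
    if not quarterly:
        return list(range(first, last + 1))              # e.g. 2018, 2019, 2020
    return [y * 100 + q
            for y in range(first // 100, last // 100 + 1)
            for q in range(1, 5)
            if first <= y * 100 + q <= last]              # e.g. 201804, 201901

def period_offset(first_year: int, period: int, quarterly: bool) -> int:
    """Slot of `period` in an index file whose header year is `first_year`."""
    if quarterly:
        return (period // 100 - first_year) * 4 + period % 100 - 1
    return period - first_year

assert period_list(201804, 201901, quarterly=True) == [201804, 201901]
assert period_offset(2018, 201901, quarterly=True) == 4   # second year, Q1
```

Because the offset is a pure function of `first_year` and the period, the `.index` file never needs to store the periods themselves.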
+ +import abc +import shutil +import struct +import traceback +from pathlib import Path +from typing import Iterable, List, Union +from functools import partial +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor + +import fire +import numpy as np +import pandas as pd +from tqdm import tqdm +from loguru import logger +from qlib.utils import fname_to_code, code_to_fname, get_period_offset +from qlib.config import C + + +class DumpPitData: + PIT_DIR_NAME = "financial" + PIT_CSV_SEP = "," + DATA_FILE_SUFFIX = ".data" + INDEX_FILE_SUFFIX = ".index" + + INTERVAL_quarterly = "quarterly" + INTERVAL_annual = "annual" + + PERIOD_DTYPE = C.pit_record_type["period"] + INDEX_DTYPE = C.pit_record_type["index"] + DATA_DTYPE = "".join( + [ + C.pit_record_type["date"], + C.pit_record_type["period"], + C.pit_record_type["value"], + C.pit_record_type["index"], + ] + ) + + NA_INDEX = C.pit_record_nan["index"] + + INDEX_DTYPE_SIZE = struct.calcsize(INDEX_DTYPE) + PERIOD_DTYPE_SIZE = struct.calcsize(PERIOD_DTYPE) + DATA_DTYPE_SIZE = struct.calcsize(DATA_DTYPE) + + UPDATE_MODE = "update" + ALL_MODE = "all" + + def __init__( + self, + csv_path: str, + qlib_dir: str, + backup_dir: str = None, + freq: str = "quarterly", + max_workers: int = 16, + date_column_name: str = "date", + period_column_name: str = "period", + value_column_name: str = "value", + field_column_name: str = "field", + file_suffix: str = ".csv", + exclude_fields: str = "", + include_fields: str = "", + limit_nums: int = None, + ): + """ + + Parameters + ---------- + csv_path: str + stock data path or directory + qlib_dir: str + qlib(dump) data director + backup_dir: str, default None + if backup_dir is not None, backup qlib_dir to backup_dir + freq: str, default "quarterly" + data frequency + max_workers: int, default None + number of threads + date_column_name: str, default "date" + the name of the date field in the csv + file_suffix: str, default ".csv" + file suffix + include_fields: tuple + dump fields + exclude_fields: tuple + fields not dumped + limit_nums: int + Use when debugging, default None + """ + csv_path = Path(csv_path).expanduser() + if isinstance(exclude_fields, str): + exclude_fields = exclude_fields.split(",") + if isinstance(include_fields, str): + include_fields = include_fields.split(",") + self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields))) + self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields))) + self.file_suffix = file_suffix + self.csv_files = sorted(csv_path.glob(f"*{self.file_suffix}") if csv_path.is_dir() else [csv_path]) + if limit_nums is not None: + self.csv_files = self.csv_files[: int(limit_nums)] + self.qlib_dir = Path(qlib_dir).expanduser() + self.backup_dir = backup_dir if backup_dir is None else Path(backup_dir).expanduser() + if backup_dir is not None: + self._backup_qlib_dir(Path(backup_dir).expanduser()) + + self.works = max_workers + self.date_column_name = date_column_name + self.period_column_name = period_column_name + self.value_column_name = value_column_name + self.field_column_name = field_column_name + + self._mode = self.ALL_MODE + + def _backup_qlib_dir(self, target_dir: Path): + shutil.copytree(str(self.qlib_dir.resolve()), str(target_dir.resolve())) + + def get_source_data(self, file_path: Path) -> pd.DataFrame: + df = pd.read_csv(str(file_path.resolve()), low_memory=False) + df[self.value_column_name] = df[self.value_column_name].astype("float32") + df[self.date_column_name] = 
df[self.date_column_name].str.replace("-", "").astype("int32") + # df.drop_duplicates([self.date_field_name], inplace=True) + return df + + def get_symbol_from_file(self, file_path: Path) -> str: + return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower()) + + def get_dump_fields(self, df: Iterable[str]) -> Iterable[str]: + return ( + set(self._include_fields) + if self._include_fields + else set(df[self.field_column_name]) - set(self._exclude_fields) + if self._exclude_fields + else set(df[self.field_column_name]) + ) + + def get_filenames(self, symbol, field, interval): + dir_name = self.qlib_dir.joinpath(self.PIT_DIR_NAME, symbol) + dir_name.mkdir(parents=True, exist_ok=True) + return ( + dir_name.joinpath(f"{field}_{interval[0]}{self.DATA_FILE_SUFFIX}".lower()), + dir_name.joinpath(f"{field}_{interval[0]}{self.INDEX_FILE_SUFFIX}".lower()), + ) + + def _dump_pit( + self, + file_path: str, + interval: str = "quarterly", + overwrite: bool = False, + ): + """ + dump data as the following format: + `/path/to/.data` + [date, period, value, _next] + [date, period, value, _next] + [...] + `/path/to/.index` + [first_year, index, index, ...] + + `` contains the data as the point-in-time (PIT) order: `value` of `period` + is published at `date`, and its successive revised value can be found at `_next` (linked list). + + `.index` contains the index of value for each period (quarter or year). To save + disk space, we only store the `first_year` as its followings periods can be easily infered. + + Parameters + ---------- + symbol: str + stock symbol + interval: str + data interval + overwrite: bool + whether overwrite existing data or update only + """ + symbol = self.get_symbol_from_file(file_path) + df = self.get_source_data(file_path) + if df.empty: + logger.warning(f"{symbol} file is empty") + return + for field in self.get_dump_fields(df): + df_sub = df.query(f'{self.field_column_name}=="{field}"').sort_values(self.date_column_name) + if df_sub.empty: + logger.warning(f"field {field} of {symbol} is empty") + continue + data_file, index_file = self.get_filenames(symbol, field, interval) + + ## calculate first & last period + start_year = df_sub[self.period_column_name].min() + end_year = df_sub[self.period_column_name].max() + if interval == self.INTERVAL_quarterly: + start_year //= 100 + end_year //= 100 + + # adjust `first_year` if existing data found + if not overwrite and index_file.exists(): + with open(index_file, "rb") as fi: + (first_year,) = struct.unpack(self.PERIOD_DTYPE, fi.read(self.PERIOD_DTYPE_SIZE)) + n_years = len(fi.read()) // self.INDEX_DTYPE_SIZE + if interval == self.INTERVAL_quarterly: + n_years //= 4 + start_year = first_year + n_years + else: + with open(index_file, "wb") as f: + f.write(struct.pack(self.PERIOD_DTYPE, start_year)) + first_year = start_year + + # if data already exists, continue to the next field + if start_year > end_year: + logger.warning(f"{symbol}-{field} data already exists, continue to the next field") + continue + + # dump index filled with NA + with open(index_file, "ab") as fi: + for year in range(start_year, end_year + 1): + if interval == self.INTERVAL_quarterly: + fi.write(struct.pack(self.INDEX_DTYPE * 4, *[self.NA_INDEX] * 4)) + else: + fi.write(struct.pack(self.INDEX_DTYPE, self.NA_INDEX)) + + # if data already exists, remove overlapped data + if not overwrite and data_file.exists(): + with open(data_file, "rb") as fd: + fd.seek(-self.DATA_DTYPE_SIZE, 2) + last_date, _, _, _ = struct.unpack(self.DATA_DTYPE, fd.read()) + 
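Each entry in a `.data` file is one fixed-size `(date, period, value, _next)` record, which is why the code above can seek back by `DATA_DTYPE_SIZE` from the end of the file to read the last dumped row. A small round-trip sketch follows; the `"IIfI"` format string is an assumption (unsigned ints plus a float32 for the value) that matches the `4294967295` NA sentinel shown in the PIT docs later in this series:

```python
import struct

DATA_DTYPE = "IIfI"      # date, period, value, _next (assumed type codes for C.pit_record_type)
NA_INDEX = 0xFFFFFFFF    # 4294967295, i.e. "no later revision"

record = struct.pack(DATA_DTYPE, 20190718, 201902, 0.175322, NA_INDEX)
assert len(record) == struct.calcsize(DATA_DTYPE)

date, period, value, _next = struct.unpack(DATA_DTYPE, record)
print(date, period, round(value, 6), _next)   # 20190718 201902 0.175322 4294967295
```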
df_sub = df_sub.query(f"{self.date_column_name}>{last_date}") + # otherwise, + # 1) truncate existing file or create a new file with `wb+` if overwrite, + # 2) or append existing file or create a new file with `ab+` if not overwrite + else: + with open(data_file, "wb+" if overwrite else "ab+"): + pass + + with open(data_file, "rb+") as fd, open(index_file, "rb+") as fi: + + # update index if needed + for i, row in df_sub.iterrows(): + # get index + offset = get_period_offset(first_year, row.period, interval == self.INTERVAL_quarterly) + + fi.seek(self.PERIOD_DTYPE_SIZE + self.INDEX_DTYPE_SIZE * offset) + (cur_index,) = struct.unpack(self.INDEX_DTYPE, fi.read(self.INDEX_DTYPE_SIZE)) + + # Case I: new data => update `_next` with current index + if cur_index == self.NA_INDEX: + fi.seek(self.PERIOD_DTYPE_SIZE + self.INDEX_DTYPE_SIZE * offset) + fi.write(struct.pack(self.INDEX_DTYPE, fd.tell())) + # Case II: previous data exists => find and update the last `_next` + else: + _cur_fd = fd.tell() + prev_index = self.NA_INDEX + while cur_index != self.NA_INDEX: # NOTE: first iter always != NA_INDEX + fd.seek(cur_index + self.DATA_DTYPE_SIZE - self.INDEX_DTYPE_SIZE) + prev_index = cur_index + (cur_index,) = struct.unpack(self.INDEX_DTYPE, fd.read(self.INDEX_DTYPE_SIZE)) + fd.seek(prev_index + self.DATA_DTYPE_SIZE - self.INDEX_DTYPE_SIZE) + fd.write(struct.pack(self.INDEX_DTYPE, _cur_fd)) # NOTE: add _next pointer + fd.seek(_cur_fd) + + # dump data + fd.write(struct.pack(self.DATA_DTYPE, row.date, row.period, row.value, self.NA_INDEX)) + + def dump(self, interval="quarterly", overwrite=False): + logger.info("start dump pit data......") + _dump_func = partial(self._dump_pit, interval=interval, overwrite=overwrite) + + with tqdm(total=len(self.csv_files)) as p_bar: + with ProcessPoolExecutor(max_workers=self.works) as executor: + for _ in executor.map(_dump_func, self.csv_files): + p_bar.update() + + def __call__(self, *args, **kwargs): + self.dump() + + +if __name__ == "__main__": + fire.Fire(DumpPitData) diff --git a/tests/notest_PIT.py b/tests/notest_PIT.py index a6fbcc353f..cb6e02313e 100644 --- a/tests/notest_PIT.py +++ b/tests/notest_PIT.py @@ -6,29 +6,24 @@ import qlib from qlib.data import D from qlib.tests import TestAutoData -from qlib.config import REG_US +from qlib.config import REG_CN class TestRegiterCustomOps(TestAutoData): @classmethod def setUpClass(cls) -> None: # use default data - provider_uri = "~/.qlib/qlib_data/us_data" # target_dir - qlib.init(provider_uri=provider_uri, region=REG_US) + provider_uri = "~/.qlib/qlib_data/cn_data_new" # target_dir + qlib.init(provider_uri=provider_uri, region=REG_CN) def test_regiter_custom_ops(self): - instruments = ["a1x4w7"] - fields = ["$$q_accountspayable / $$q_totalcurrentassets"] - print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) - fields = [ - "($$q_accountspayable / $$q_totalcurrentassets) / PRef($$q_accountspayable / $$q_totalcurrentassets, 1) - 1" - ] - print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) - fields = ["PSum($$q_totalrevenue/$$q_totalcurrentassets, 4)"] - print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) - fields = ["$$q_totalcurrentassets/$close/10000"] - print(D.features(instruments, fields, start_time="2020-06-01", end_time="2020-06-10", freq="day")) + instruments = ["sz000708"] + fields = ["$$roewa_q", "$$yoyni_q"] + fields += ["($$roewa_q / $$yoyni_q) / PRef($$roewa_q / 
$$yoyni_q, 1) - 1"] + fields += ["PSum($$yoyni_q, 4)"] + fields += ["$close", "$$roewa_q*$close"] + print(D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")) if __name__ == "__main__": From 566a8f9a00085d165bd13d50d315108029d8ceb7 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 4 Mar 2022 16:47:09 +0800 Subject: [PATCH 22/30] Successfully run and understand PIT --- docs/advanced/PIT.rst | 108 ++++++++++++++++++++++++ qlib/data/base.py | 3 + qlib/data/data.py | 5 +- qlib/utils/__init__.py | 29 ++++++- scripts/data_collector/base.py | 2 + scripts/data_collector/pit/README.md | 15 +++- scripts/data_collector/pit/collector.py | 24 +++++- scripts/data_collector/pit/test_pit.py | 49 +++++++++++ setup.py | 8 ++ tests/notest_PIT.py | 10 +-- 10 files changed, 237 insertions(+), 16 deletions(-) create mode 100644 docs/advanced/PIT.rst create mode 100644 scripts/data_collector/pit/test_pit.py diff --git a/docs/advanced/PIT.rst b/docs/advanced/PIT.rst new file mode 100644 index 0000000000..c5d441d1b5 --- /dev/null +++ b/docs/advanced/PIT.rst @@ -0,0 +1,108 @@ +.. _alpha: + +=========================== +(P)oint-(I)n-(T)ime Database +=========================== +.. currentmodule:: qlib + + +For each feature, it contains 4 columns, i.e. date, period, value, _next. +Each row corresponds to a statement. + +The meaning of each feature with filename like `XXX_a.data` +- `date`: the statement's date of publication. +- `period`: the period of the statement. (e.g. it will be quarterly frequency in most of the markets) + - If it is an annual period, it will be an integer corresponding to the year + - If it is an quarterly periods, it will be an integer like ``. The last two decimal digits represents the index of quarter. Others represent the year. +- `value`: the described value +- `_next`: the byte index of the next occurance of the field. + +Besides the feature, a index `XXX_a.index` + +The statements are soted by the `date` in ascending order from the beginning of the file. + +.. 
code-block:: python + + # the data format from XXXX.data + array([(20070428, 200701, 0.090219 , 4294967295), + (20070817, 200702, 0.13933 , 4294967295), + (20071023, 200703, 0.24586301, 4294967295), + (20080301, 200704, 0.3479 , 80), + (20080313, 200704, 0.395989 , 4294967295), + (20080422, 200801, 0.100724 , 4294967295), + (20080828, 200802, 0.24996801, 4294967295), + (20081027, 200803, 0.33412001, 4294967295), + (20090325, 200804, 0.39011699, 4294967295), + (20090421, 200901, 0.102675 , 4294967295), + (20090807, 200902, 0.230712 , 4294967295), + (20091024, 200903, 0.30072999, 4294967295), + (20100402, 200904, 0.33546099, 4294967295), + (20100426, 201001, 0.083825 , 4294967295), + (20100812, 201002, 0.200545 , 4294967295), + (20101029, 201003, 0.260986 , 4294967295), + (20110321, 201004, 0.30739301, 4294967295), + (20110423, 201101, 0.097411 , 4294967295), + (20110831, 201102, 0.24825101, 4294967295), + (20111018, 201103, 0.318919 , 4294967295), + (20120323, 201104, 0.4039 , 420), + (20120411, 201104, 0.403925 , 4294967295), + (20120426, 201201, 0.112148 , 4294967295), + (20120810, 201202, 0.26484701, 4294967295), + (20121026, 201203, 0.370487 , 4294967295), + (20130329, 201204, 0.45004699, 4294967295), + (20130418, 201301, 0.099958 , 4294967295), + (20130831, 201302, 0.21044201, 4294967295), + (20131016, 201303, 0.30454299, 4294967295), + (20140325, 201304, 0.394328 , 4294967295), + (20140425, 201401, 0.083217 , 4294967295), + (20140829, 201402, 0.16450299, 4294967295), + (20141030, 201403, 0.23408499, 4294967295), + (20150421, 201404, 0.319612 , 4294967295), + (20150421, 201501, 0.078494 , 4294967295), + (20150828, 201502, 0.137504 , 4294967295), + (20151023, 201503, 0.201709 , 4294967295), + (20160324, 201504, 0.26420501, 4294967295), + (20160421, 201601, 0.073664 , 4294967295), + (20160827, 201602, 0.136576 , 4294967295), + (20161029, 201603, 0.188062 , 4294967295), + (20170415, 201604, 0.244385 , 4294967295), + (20170425, 201701, 0.080614 , 4294967295), + (20170728, 201702, 0.15151 , 4294967295), + (20171026, 201703, 0.25416601, 4294967295), + (20180328, 201704, 0.32954201, 4294967295), + (20180428, 201801, 0.088887 , 4294967295), + (20180802, 201802, 0.170563 , 4294967295), + (20181029, 201803, 0.25522 , 4294967295), + (20190329, 201804, 0.34464401, 4294967295), + (20190425, 201901, 0.094737 , 4294967295), + (20190713, 201902, 0. , 1040), + (20190718, 201902, 0.175322 , 4294967295), + (20191016, 201903, 0.25581899, 4294967295)], + dtype=[('date', ' List[int]: + """ + This method will be used in PIT database. + It return all the possible values between `first` and `end` (first and end is included) + + Parameters + ---------- + quarterly : bool + will it return quarterly index or yearly index. + + Returns + ------- + List[int] + the possible index between [first, last] + """ + + if not quarterly: assert all(1900 <= x <= 2099 for x in (first, last)), "invalid arguments" return list(range(first, last + 1)) @@ -80,7 +96,14 @@ def get_period_offset(first_year, period, quarterly): def read_period_data(index_path, data_path, period, cur_date, quarterly, last_period_index): + """ + At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). + Only the updating info before cur_date or at cur_date will be used. 
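Conceptually, the lookup described here walks the chain of revisions for a single reporting period and keeps the last one published on or before `cur_date`. A simplified in-memory sketch of that rule is given below; the real `read_period_data` follows the on-disk `_next` byte offsets instead of a Python list:

```python
# (publication_date, value) revisions of one period, in publication order;
# on disk they are chained through the `_next` byte offsets
def latest_value_asof(revisions, cur_date):
    value = float("nan")
    for pub_date, v in revisions:
        if pub_date > cur_date:
            break          # published after the query date -> not visible yet
        value = v          # keep the most recent visible revision
    return value

revisions = [(20190713, 0.0), (20190718, 0.175322)]   # the 2019Q2 example from the docs
assert latest_value_asof(revisions, 20190715) == 0.0
assert latest_value_asof(revisions, 20190719) == 0.175322
```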
+ Returns + ------- + the query value and byte index the index value + """ DATA_DTYPE = "".join( [ C.pit_record_type["date"], @@ -114,7 +137,7 @@ def read_period_data(index_path, data_path, period, cur_date, quarterly, last_pe while _next != NAN_INDEX: fd.seek(_next) date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) - if date >= cur_date: # NOTE: only use after published date + if date > cur_date: break prev_next = _next _next = new_next diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index f33f1d95f2..92d47791ab 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -362,6 +362,7 @@ def download_data( interval="1d", check_data_length=False, limit_nums=None, + **kwargs, ): """download data from Internet @@ -401,6 +402,7 @@ def download_data( interval=interval, check_data_length=check_data_length, limit_nums=limit_nums, + **kwargs ).collector_data() def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"): diff --git a/scripts/data_collector/pit/README.md b/scripts/data_collector/pit/README.md index 613e5cd704..e18dcd0c17 100644 --- a/scripts/data_collector/pit/README.md +++ b/scripts/data_collector/pit/README.md @@ -14,13 +14,22 @@ pip install -r requirements.txt ### Download Quarterly CN Data ```bash - +cd qlib/scripts/data_collector/pit/ # download from baostock.com -python collector.py download_data --source_dir /data1/v-xiabi/qlib/pit/csv_2 --start 2000-01-01 --end 2020-01-01 --interval quarterly +python collector.py download_data --source_dir ./csv_pit --start 2000-01-01 --end 2020-01-01 --interval quarterly +``` +Downloading all data from the stock is very time consuming. If you just want run a quick test on a few stocks, you can run the command below +``` bash +python collector.py download_data --source_dir ./csv_pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_flt_regx "^(600519|000725).*" ``` + + ### Dump Data into PIT Format +```bash cd qlib/scripts -python dump_pit.py dump --csv_path /data1/v-xiabi/qlib/pit/csv_2 --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly \ No newline at end of file +# data_collector/pit/csv_pit is the data you download just now. +python dump_pit.py dump --csv_path data_collector/pit/csv_pit --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly +``` diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index 6ff7280b42..a7eb2ca1ee 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
+import re import abc import sys import datetime @@ -40,6 +41,7 @@ def __init__( delay=0, check_data_length: bool = False, limit_nums: int = None, + symbol_flt_regx = None, ): """ @@ -62,6 +64,10 @@ def __init__( limit_nums: int using for debug, by default None """ + if symbol_flt_regx is None: + self.symbol_flt_regx = None + else: + self.symbol_flt_regx = re.compile(symbol_flt_regx) super(PitCollector, self).__init__( save_dir=save_dir, start=start, @@ -82,7 +88,16 @@ def normalize_symbol(self, symbol): def get_instrument_list(self): logger.info("get cn stock symbols......") symbols = get_hs_stock_symbols() - logger.info(f"get {len(symbols)} symbols.") + logger.info(f"get {symbols[:10]}[{len(symbols)}] symbols.") + if self.symbol_flt_regx is not None: + s_flt = [] + for s in symbols: + m = self.symbol_flt_regx.match(s) + if m is not None: + s_flt.append(s) + logger.info(f"after filtering, it becomes {s_flt[:10]}[{len(s_flt)}] symbols") + return s_flt + return symbols def _get_data_from_baostock(self, symbol, interval, start_datetime, end_datetime): @@ -95,7 +110,9 @@ def _str_to_float(r): return np.nan try: - symbol = f"{symbol[7:]}.{symbol[:6]}" + code, market = symbol.split('.') + market = {"ss": "sh"}.get(market, market) # baostock's API naming is different from default symbol list + symbol = f"{market}.{code}" rs_report = bs.query_performance_express_report( code=symbol, start_date=str(start_datetime.date()), end_date=str(end_datetime.date()) ) @@ -276,6 +293,7 @@ def download_data( interval="quarterly", check_data_length=False, limit_nums=None, + **kwargs, ): """download data from Internet @@ -302,7 +320,7 @@ def download_data( $ python collector.py download_data --source_dir ~/.qlib/cn_data/source/pit_quarter --start 2000-01-01 --end 2021-01-01 --interval quarterly """ - super(Run, self).download_data(max_collector_count, delay, start, end, interval, check_data_length, limit_nums) + super(Run, self).download_data(max_collector_count, delay, start, end, interval, check_data_length, limit_nums, **kwargs) def normalize_class_name(self): pass diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py new file mode 100644 index 0000000000..0e1085182d --- /dev/null +++ b/scripts/data_collector/pit/test_pit.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
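The collector change above converts symbols from the `code.exchange` form used by the symbol list (e.g. `600519.ss`, as in the `--symbol_flt_regx` example) into baostock's `exchange.code` form. A standalone version of that mapping, for illustration only:

```python
def to_baostock_symbol(symbol: str) -> str:
    """'600519.ss' -> 'sh.600519', '000725.sz' -> 'sz.000725'."""
    code, market = symbol.split(".")
    market = {"ss": "sh"}.get(market, market)   # baostock uses 'sh' rather than 'ss'
    return f"{market}.{code}"

assert to_baostock_symbol("600519.ss") == "sh.600519"
assert to_baostock_symbol("000725.sz") == "sz.000725"
```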
+import qlib +from qlib.data import D +import unittest + + +class TestPIT(unittest.TestCase): + + def setUp(self): + qlib.init() + + def to_str(self, obj): + return "".join(str(obj).split()) + + def test_index_data(self): + instruments = ["sh600519"] + fields = ["$$roewa_q", "$$yoyni_q"] + # Mao Tai published 2019Q2 report at 2019-07-13 & 2019-07-18 + # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index + data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") + + res = ''' + $$roewa_q $$yoyni_q + count 133.000000 133.000000 + mean 0.196412 0.277930 + std 0.097591 0.030262 + min 0.000000 0.243892 + 25% 0.094737 0.243892 + 50% 0.255220 0.304181 + 75% 0.255220 0.305041 + max 0.344644 0.305041 + ''' + self.assertEqual(self.to_str(data.describe()), self.to_str(res)) + + res = ''' + $$roewa_q $$yoyni_q + instrument datetime + sh600519 2019-07-15 0.000000 0.305041 + 2019-07-16 0.000000 0.305041 + 2019-07-17 0.000000 0.305041 + 2019-07-18 0.175322 0.252650 + 2019-07-19 0.175322 0.252650 + ''' + self.assertEqual(self.to_str(data.tail()), self.to_str(res)) + + +if __name__ == "__main__": + unittest.main() diff --git a/setup.py b/setup.py index 83cf6e1b60..5f24943420 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,14 @@ }, ext_modules=extensions, install_requires=REQUIRED, + extras_require={ + "dev": [ + 'coverage', + 'pytest>=3', + 'sphinx', + 'sphinx_rtd_theme', + ] + }, include_package_data=True, classifiers=[ # Trove classifiers diff --git a/tests/notest_PIT.py b/tests/notest_PIT.py index cb6e02313e..cc1545e7f2 100644 --- a/tests/notest_PIT.py +++ b/tests/notest_PIT.py @@ -6,24 +6,24 @@ import qlib from qlib.data import D from qlib.tests import TestAutoData -from qlib.config import REG_CN class TestRegiterCustomOps(TestAutoData): @classmethod def setUpClass(cls) -> None: # use default data - provider_uri = "~/.qlib/qlib_data/cn_data_new" # target_dir - qlib.init(provider_uri=provider_uri, region=REG_CN) + qlib.init() def test_regiter_custom_ops(self): - instruments = ["sz000708"] + instruments = ["sh600519"] fields = ["$$roewa_q", "$$yoyni_q"] fields += ["($$roewa_q / $$yoyni_q) / PRef($$roewa_q / $$yoyni_q, 1) - 1"] fields += ["PSum($$yoyni_q, 4)"] fields += ["$close", "$$roewa_q*$close"] - print(D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")) + data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day") + print(data) + print(data.describe()) if __name__ == "__main__": From 499738938190406d71b7807915f99e3b8fe03bee Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 4 Mar 2022 18:05:11 +0800 Subject: [PATCH 23/30] Add some docs and remove a bug --- docs/advanced/PIT.rst | 26 +++++++++++++++++++++++++- qlib/data/data.py | 1 - 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/docs/advanced/PIT.rst b/docs/advanced/PIT.rst index c5d441d1b5..12cfd5397f 100644 --- a/docs/advanced/PIT.rst +++ b/docs/advanced/PIT.rst @@ -6,6 +6,30 @@ .. currentmodule:: qlib +Introduction +------------ +Point-in-time data is a very important consideration when performing any sort of historical market analysis. + +For example, let’s say we are backtesting a trading strategy and we are using the past five years of historical data as our input. +Our model is assumed to trade once a day, at the market close, and we’ll say we are calculating the trading signal for 1 January 2020 in our backtest. 
At that point, we should only have data for 1 January 2020, 31 December 2019, 30 December 2019 etc. + +In financial data (especially financial reports), the same piece of data may be amended for multiple times overtime. If we only use the latest version for historical backtesting, data leakage will happen. +Point-in-time database is designed for solving this problem to make sure user get the right version of data at any historical timestamp. It will keep the performance of online trading and historical backtesting the same. + + + +Data Preparation +---------------- + +Qlib provides a crawler to help users to download financial data and then a converter to dump the data in Qlib format. +Please follow `scripts/data_collector/pit/README.md` to download and convert data. + + +File-based design for PIT data +------------------------------ + +Qlib provides a file-based storage for PIT data. + For each feature, it contains 4 columns, i.e. date, period, value, _next. Each row corresponds to a statement. @@ -17,7 +41,7 @@ The meaning of each feature with filename like `XXX_a.data` - `value`: the described value - `_next`: the byte index of the next occurance of the field. -Besides the feature, a index `XXX_a.index` +Besides the feature data, an index `XXX_a.index` is included to speed up the querying performance The statements are soted by the `date` in ascending order from the beginning of the file. diff --git a/qlib/data/data.py b/qlib/data/data.py index aab04d2e2a..cb36aa4133 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -62,7 +62,6 @@ def backend_obj(self, **kwargs): backend = copy.deepcopy(backend) backend.setdefault("kwargs", {}).update(**kwargs) return init_instance_by_config(backend) ->>>>>>> origin/main class CalendarProvider(abc.ABC): From cf77cd02244217427839ef30e7eb1e2cd689895d Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 8 Mar 2022 12:05:02 +0800 Subject: [PATCH 24/30] mv crypto collector --- scripts/data_collector/crypto/collector.py | 36 +++++++++++++++++++++- scripts/data_collector/utils.py | 33 -------------------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/scripts/data_collector/crypto/collector.py b/scripts/data_collector/crypto/collector.py index 0790aa6405..d25568a72a 100644 --- a/scripts/data_collector/crypto/collector.py +++ b/scripts/data_collector/crypto/collector.py @@ -13,7 +13,7 @@ CUR_DIR = Path(__file__).resolve().parent sys.path.append(str(CUR_DIR.parent.parent)) from data_collector.base import BaseCollector, BaseNormalize, BaseRun -from data_collector.utils import get_cg_crypto_symbols +from data_collector.utils import get_cg_crypto_symbols, deco_retry from pycoingecko import CoinGeckoAPI from time import mktime @@ -21,6 +21,40 @@ import time +_CG_CRYPTO_SYMBOLS = None + + +def get_cg_crypto_symbols(qlib_data_path: [str, Path] = None) -> list: + """get crypto symbols in coingecko + + Returns + ------- + crypto symbols in given exchanges list of coingecko + """ + global _CG_CRYPTO_SYMBOLS + + @deco_retry + def _get_coingecko(): + try: + cg = CoinGeckoAPI() + resp = pd.DataFrame(cg.get_coins_markets(vs_currency="usd")) + except: + raise ValueError("request error") + try: + _symbols = resp["id"].to_list() + except Exception as e: + logger.warning(f"request error: {e}") + raise + return _symbols + + if _CG_CRYPTO_SYMBOLS is None: + _all_symbols = _get_coingecko() + + _CG_CRYPTO_SYMBOLS = sorted(set(_all_symbols)) + + return _CG_CRYPTO_SYMBOLS + + class CryptoCollector(BaseCollector): def __init__( self, diff --git 
a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 33e3a047f5..19131ec29f 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -19,7 +19,6 @@ from tqdm import tqdm from functools import partial from concurrent.futures import ProcessPoolExecutor -from pycoingecko import CoinGeckoAPI HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" @@ -43,7 +42,6 @@ _US_SYMBOLS = None _IN_SYMBOLS = None _EN_FUND_SYMBOLS = None -_CG_CRYPTO_SYMBOLS = None _CALENDAR_MAP = {} # NOTE: Until 2020-10-20 20:00:00 @@ -379,37 +377,6 @@ def _get_eastmoney(): return _EN_FUND_SYMBOLS -def get_cg_crypto_symbols(qlib_data_path: [str, Path] = None) -> list: - """get crypto symbols in coingecko - - Returns - ------- - crypto symbols in given exchanges list of coingecko - """ - global _CG_CRYPTO_SYMBOLS - - @deco_retry - def _get_coingecko(): - try: - cg = CoinGeckoAPI() - resp = pd.DataFrame(cg.get_coins_markets(vs_currency="usd")) - except: - raise ValueError("request error") - try: - _symbols = resp["id"].to_list() - except Exception as e: - logger.warning(f"request error: {e}") - raise - return _symbols - - if _CG_CRYPTO_SYMBOLS is None: - _all_symbols = _get_coingecko() - - _CG_CRYPTO_SYMBOLS = sorted(set(_all_symbols)) - - return _CG_CRYPTO_SYMBOLS - - def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str: """symbol suffix to prefix From 79422a1166d4c5de22b53b31cac641a89ebf46ce Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 8 Mar 2022 12:06:55 +0800 Subject: [PATCH 25/30] black format --- qlib/data/base.py | 11 ++++++++++- qlib/data/data.py | 2 +- qlib/data/ops.py | 1 + qlib/utils/__init__.py | 11 ++++++----- scripts/data_collector/base.py | 2 +- scripts/data_collector/pit/collector.py | 8 +++++--- scripts/data_collector/pit/test_pit.py | 9 ++++----- setup.py | 8 ++++---- 8 files changed, 32 insertions(+), 20 deletions(-) diff --git a/qlib/data/base.py b/qlib/data/base.py index c9ca045aab..82eb42321a 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -15,7 +15,15 @@ class Expression(abc.ABC): - """Expression base class""" + """ + Expression base class + + Expression is designed to handle the calculation of data with the format below + data with two dimension for each instrument, + - feature + - time: it could be observation time or period time. + - period time is designed for Point-in-time database. For example, the period time maybe 2014Q4, its value can observed for multiple times(different value may be observed at different time due to amendment). + """ def __str__(self): return type(self).__name__ @@ -235,6 +243,7 @@ class ExpressionOps(Expression): This kind of feature will use operator for feature construction on the fly. 
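The reworked `Expression` docstring above distinguishes observation-time data from period-time (point-in-time) data. In expression strings the split is made by the prefix: `$close` becomes a time-indexed `Feature`, while `$$roewa_q` becomes a period-indexed `PFeature`. A tiny sketch of the first two `parse_field` rewrite rules shown later in this patch (the full function also routes operator calls through `Operators`):

```python
import re

# mirror of the first two rewrite rules in qlib.utils.parse_field:
# "$$name" becomes a period-indexed PFeature, "$name" a time-indexed Feature
def rewrite(field: str) -> str:
    field = re.sub(r"\$\$(\w+)", r'PFeature("\1")', field)   # "$$" must run first,
    field = re.sub(r"\$(\w+)", r'Feature("\1")', field)      # then plain "$"
    return field

print(rewrite("$$roewa_q * $close"))
# PFeature("roewa_q") * Feature("close")
```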
""" + pass diff --git a/qlib/data/data.py b/qlib/data/data.py index cb36aa4133..7c56a364f2 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -705,7 +705,7 @@ def feature(self, instrument, field, start_index, end_index, freq): return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): - """get the historical periods data series for `start_offset` and `end_offset` """ + """get the historical periods data series for `start_offset` and `end_offset`""" DATA_RECORDS = [ ("date", C.pit_record_type["date"]), diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 8932c17d96..8af05175b4 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1649,6 +1649,7 @@ def register_all_ops(C): # FIXME: I don't think it is necessary from .ops_period import PeriodOpsList + Operators.register(PeriodOpsList) if getattr(C, "custom_ops", None) is not None: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index ca59d774ca..06956b1446 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -29,7 +29,7 @@ import numpy as np import pandas as pd from pathlib import Path -from typing import List, Dict, Union, Tuple, Any, Text, Optional, Callable +from typing import List, Dict, Union, Tuple, Any, Text, Optional, Callable from types import ModuleType from urllib.parse import urlparse from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer @@ -79,7 +79,6 @@ def get_period_list(first: int, last: int, quarterly: bool) -> List[int]: the possible index between [first, last] """ - if not quarterly: assert all(1900 <= x <= 2099 for x in (first, last)), "invalid arguments" return list(range(first, last + 1)) @@ -262,9 +261,11 @@ def parse_field(field): if not isinstance(field, str): field = str(field) - for pattern, new in [(r"\$\$(\w+)", r'PFeature("\1")'), # $$ must be before $ - (r"\$(\w+)", rf'Feature("\1")'), - (r"(\w+\s*)\(", r"Operators.\1(")]: # Features # Operators + for pattern, new in [ + (r"\$\$(\w+)", r'PFeature("\1")'), # $$ must be before $ + (r"\$(\w+)", rf'Feature("\1")'), + (r"(\w+\s*)\(", r"Operators.\1("), + ]: # Features # Operators field = re.sub(pattern, new, field) return field diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index ddcb879f39..236d9cddfe 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -399,7 +399,7 @@ def download_data( interval=self.interval, check_data_length=check_data_length, limit_nums=limit_nums, - **kwargs + **kwargs, ).collector_data() def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs): diff --git a/scripts/data_collector/pit/collector.py b/scripts/data_collector/pit/collector.py index a7eb2ca1ee..45e1f984eb 100644 --- a/scripts/data_collector/pit/collector.py +++ b/scripts/data_collector/pit/collector.py @@ -41,7 +41,7 @@ def __init__( delay=0, check_data_length: bool = False, limit_nums: int = None, - symbol_flt_regx = None, + symbol_flt_regx=None, ): """ @@ -110,7 +110,7 @@ def _str_to_float(r): return np.nan try: - code, market = symbol.split('.') + code, market = symbol.split(".") market = {"ss": "sh"}.get(market, market) # baostock's API naming is different from default symbol list symbol = f"{market}.{code}" rs_report = bs.query_performance_express_report( @@ -320,7 +320,9 @@ def download_data( $ python collector.py download_data --source_dir 
~/.qlib/cn_data/source/pit_quarter --start 2000-01-01 --end 2021-01-01 --interval quarterly """ - super(Run, self).download_data(max_collector_count, delay, start, end, interval, check_data_length, limit_nums, **kwargs) + super(Run, self).download_data( + max_collector_count, delay, start, end, interval, check_data_length, limit_nums, **kwargs + ) def normalize_class_name(self): pass diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index 0e1085182d..c461702533 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -6,7 +6,6 @@ class TestPIT(unittest.TestCase): - def setUp(self): qlib.init() @@ -20,7 +19,7 @@ def test_index_data(self): # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") - res = ''' + res = """ $$roewa_q $$yoyni_q count 133.000000 133.000000 mean 0.196412 0.277930 @@ -30,10 +29,10 @@ def test_index_data(self): 50% 0.255220 0.304181 75% 0.255220 0.305041 max 0.344644 0.305041 - ''' + """ self.assertEqual(self.to_str(data.describe()), self.to_str(res)) - res = ''' + res = """ $$roewa_q $$yoyni_q instrument datetime sh600519 2019-07-15 0.000000 0.305041 @@ -41,7 +40,7 @@ def test_index_data(self): 2019-07-17 0.000000 0.305041 2019-07-18 0.175322 0.252650 2019-07-19 0.175322 0.252650 - ''' + """ self.assertEqual(self.to_str(data.tail()), self.to_str(res)) diff --git a/setup.py b/setup.py index 6e7970c0a3..2bd3f0410c 100644 --- a/setup.py +++ b/setup.py @@ -128,10 +128,10 @@ def get_version(rel_path: str) -> str: install_requires=REQUIRED, extras_require={ "dev": [ - 'coverage', - 'pytest>=3', - 'sphinx', - 'sphinx_rtd_theme', + "coverage", + "pytest>=3", + "sphinx", + "sphinx_rtd_theme", ] }, include_package_data=True, From 48ea2c52e62083f87e06f2b7fdc76497dc78ff50 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 8 Mar 2022 22:08:49 +0800 Subject: [PATCH 26/30] Run succesfully after merging master --- qlib/config.py | 20 +----------- qlib/data/__init__.py | 1 + qlib/data/base.py | 23 ++++++------- qlib/data/data.py | 45 +++++++++++++++++++------- qlib/data/ops.py | 10 +++--- scripts/data_collector/pit/test_pit.py | 37 +++++++++++++++++++-- 6 files changed, 84 insertions(+), 52 deletions(-) diff --git a/qlib/config.py b/qlib/config.py index ed10ea8b8a..48f0a325c3 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -92,6 +92,7 @@ def set_conf_from_C(self, config_c): "calendar_provider": "LocalCalendarProvider", "instrument_provider": "LocalInstrumentProvider", "feature_provider": "LocalFeatureProvider", + "pit_provider": "LocalPITProvider", "expression_provider": "LocalExpressionProvider", "dataset_provider": "LocalDatasetProvider", "provider": "LocalProvider", @@ -108,7 +109,6 @@ def set_conf_from_C(self, config_c): "provider_uri": "", # cache "expression_cache": None, - "dataset_cache": None, "calendar_cache": None, # for simple dataset cache "local_cache_path": None, @@ -196,20 +196,12 @@ def set_conf_from_C(self, config_c): MODE_CONF = { "server": { - # data provider config - "calendar_provider": "LocalCalendarProvider", - "instrument_provider": "LocalInstrumentProvider", - "feature_provider": "LocalFeatureProvider", - "expression_provider": "LocalExpressionProvider", - "dataset_provider": "LocalDatasetProvider", - "provider": "LocalProvider", # config it in qlib.init() "provider_uri": "", # redis "redis_host": "127.0.0.1", "redis_port": 6379, 
"redis_task_db": 1, - "kernels": NUM_USABLE_CPU, # cache "expression_cache": DISK_EXPRESSION_CACHE, "dataset_cache": DISK_DATASET_CACHE, @@ -217,25 +209,15 @@ def set_conf_from_C(self, config_c): "mount_path": None, }, "client": { - # data provider config - "calendar_provider": "LocalCalendarProvider", - "instrument_provider": "LocalInstrumentProvider", - "feature_provider": "LocalFeatureProvider", - "expression_provider": "LocalExpressionProvider", - "dataset_provider": "LocalDatasetProvider", - "provider": "LocalProvider", # config it in user's own code "provider_uri": "~/.qlib/qlib_data/cn_data", # cache # Using parameter 'remote' to announce the client is using server_cache, and the writing access will be disabled. # Disable cache by default. Avoid introduce advanced features for beginners - "expression_cache": None, "dataset_cache": None, # SimpleDatasetCache directory "local_cache_path": Path("~/.cache/qlib_simple_cache").expanduser().resolve(), - "calendar_cache": None, # client config - "kernels": NUM_USABLE_CPU, "mount_path": None, "auto_mount": False, # The nfs is already mounted on our server[auto_mount: False]. # The nfs should be auto-mounted by qlib on other diff --git a/qlib/data/__init__.py b/qlib/data/__init__.py index ef5fe4708e..4baf6c72a0 100644 --- a/qlib/data/__init__.py +++ b/qlib/data/__init__.py @@ -15,6 +15,7 @@ LocalCalendarProvider, LocalInstrumentProvider, LocalFeatureProvider, + LocalPITProvider, LocalExpressionProvider, LocalDatasetProvider, ClientCalendarProvider, diff --git a/qlib/data/base.py b/qlib/data/base.py index 82eb42321a..e0a0303d79 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -505,9 +505,13 @@ def load(self, instrument, start_index, end_index, freq): # To load expression accurately, more historical data are required start_offset = self.get_period_offset(cur_index) # The calculated value will always the last element, so the end_offset is zero. - resample_data[cur_index - start_index] = self.load_period_data( - instrument, start_offset, 0, cur_date, info=(start_index, end_index, cur_index) - ).iloc[-1] + try: + resample_data[cur_index - start_index] = self.load_period_data( + instrument, start_offset, 0, cur_date, info=(start_index, end_index, cur_index) + ).iloc[-1] + except FileNotFoundError: + get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + return pd.Series(dtype="float32", name=str(self)) resample_series = pd.Series( resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) @@ -532,21 +536,12 @@ def __init__(self, name=None): def __str__(self): return "$$" + self._name - def check_feature_exist(self, instrument): - from .data import FeatureD - - instrument = code_to_fname(instrument).lower() - index_path = FeatureD.uri_period_index.format(instrument, self._name) - data_path = FeatureD.uri_period_data.format(instrument, self._name) - - return os.path.exists(index_path) and os.path.exists(data_path) - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): # BUG: cur_idnex is a date!!!!! 
### Zhou Code - from .data import FeatureD + from .data import PITD - return FeatureD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) + return PITD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) # return pd.Series([1, 2, 3]) # fot test def get_period_offset(self, cur_index): diff --git a/qlib/data/data.py b/qlib/data/data.py index 7c56a364f2..5cf095cbcd 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -6,6 +6,7 @@ from __future__ import print_function import re +import os import abc import copy import queue @@ -334,6 +335,20 @@ def feature(self, instrument, field, start_time, end_time, freq): raise NotImplementedError("Subclass of FeatureProvider must implement `feature` method") +class PITProvider(abc.ABC): + @abc.abstractmethod + def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): + """ + get the historical periods data series for `start_offset` and `end_offset` + + Raises + ------ + FileNotFoundError + This exception will be raised if the queried data do not exist. + """ + raise NotImplementedError(f"Please implement the `period_feature` method") + + class ExpressionProvider(abc.ABC): """Expression provider class @@ -690,23 +705,17 @@ def __init__(self, remote=False, backend={}): self.remote = remote self.backend = backend - @property - def uri_period_index(self): - return os.path.join(C.get_data_path(), "financial", "{}", "{}.index") - - @property - def uri_period_data(self): - return os.path.join(C.get_data_path(), "financial", "{}", "{}.data") - def feature(self, instrument, field, start_index, end_index, freq): # validate field = str(field)[1:] instrument = code_to_fname(instrument) return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1] - def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): - """get the historical periods data series for `start_offset` and `end_offset`""" +class LocalPITProvider(PITProvider): + # TODO: Add PIT backend file storage + + def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): DATA_RECORDS = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), @@ -731,8 +740,10 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") quarterly = field.endswith("_q") - index_path = self.uri_period_index.format(instrument.lower(), field) - data_path = self.uri_period_data.format(instrument.lower(), field) + index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" + data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" + if not (index_path.exists() and data_path.exists()): + raise FileNotFoundError("No file is found. Raise exception and ") data = np.fromfile(data_path, dtype=DATA_RECORDS) # find all revision periods before `cur_date` @@ -1071,6 +1082,8 @@ def dataset( class BaseProvider: """Local provider class + It is a set of interface that allow users to access data. + Because PITD is not exposed publicly to users, so it is not included in the interface. To keep compatible with old qlib provider. 
""" @@ -1194,6 +1207,7 @@ def is_instance_of_provider(instance: object, cls: type): CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper] InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper] FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper] + PITProviderWrapper = Annotated[PITProvider, Wrapper] ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper] DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper] BaseProviderWrapper = Annotated[BaseProvider, Wrapper] @@ -1201,6 +1215,7 @@ def is_instance_of_provider(instance: object, cls: type): CalendarProviderWrapper = CalendarProvider InstrumentProviderWrapper = InstrumentProvider FeatureProviderWrapper = FeatureProvider + PITProviderWrapper = PITProvider ExpressionProviderWrapper = ExpressionProvider DatasetProviderWrapper = DatasetProvider BaseProviderWrapper = BaseProvider @@ -1208,6 +1223,7 @@ def is_instance_of_provider(instance: object, cls: type): Cal: CalendarProviderWrapper = Wrapper() Inst: InstrumentProviderWrapper = Wrapper() FeatureD: FeatureProviderWrapper = Wrapper() +PITD: PITProviderWrapper = Wrapper() ExpressionD: ExpressionProviderWrapper = Wrapper() DatasetD: DatasetProviderWrapper = Wrapper() D: BaseProviderWrapper = Wrapper() @@ -1233,6 +1249,11 @@ def register_all_wrappers(C): register_wrapper(FeatureD, feature_provider, "qlib.data") logger.debug(f"registering FeatureD {C.feature_provider}") + if getattr(C, "pit_provider", None) is not None: + pit_provider = init_instance_by_config(C.pit_provider, module) + register_wrapper(PITD, pit_provider, "qlib.data") + logger.debug(f"registering PITD {C.pit_provider}") + if getattr(C, "expression_provider", None) is not None: # This provider is unnecessary in client provider _eprovider = init_instance_by_config(C.expression_provider, module) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 8af05175b4..a5076a8b31 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -10,7 +10,7 @@ from typing import Union, List, Type from scipy.stats import percentileofscore -from .base import Expression, ExpressionOps, Feature, PExpression +from .base import Expression, ExpressionOps, Feature, PExpression, PFeature from ..log import get_module_logger from ..utils import get_callable_kwargs @@ -1588,6 +1588,7 @@ def _load_internal(self, instrument, start_index, end_index, freq): IdxMin, If, Feature, + PFeature, ] + [TResample] @@ -1620,7 +1621,10 @@ def register(self, ops_list: List[Union[Type[ExpressionOps], dict]]): else: _ops_class = _operator - if not issubclass(_ops_class, Expression): + # FIXME: remove PExpression + from .ops_period import PExpression + + if not issubclass(_ops_class, (Expression, PExpression)): raise TypeError("operator must be subclass of ExpressionOps, not {}".format(_ops_class)) if _ops_class.__name__ in self._ops: @@ -1642,8 +1646,6 @@ def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") - from .base import Operators - # Operators.reset() Operators.register(OpsList) diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index c461702533..befc1acc13 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -6,12 +6,21 @@ class TestPIT(unittest.TestCase): + """ + NOTE!!!!!! + The assert of this test assumes that users follows the cmd below and only download 2 stock. 
+ `python collector.py download_data --source_dir ./csv_pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_flt_regx "^(600519|000725).*"` + """ + def setUp(self): - qlib.init() + qlib.init(kernels=1) # NOTE: set kernel to 1 to make it debug easier def to_str(self, obj): return "".join(str(obj).split()) + def check_same(self, a, b): + self.assertEqual(self.to_str(a), self.to_str(b)) + def test_index_data(self): instruments = ["sh600519"] fields = ["$$roewa_q", "$$yoyni_q"] @@ -30,7 +39,7 @@ def test_index_data(self): 75% 0.255220 0.305041 max 0.344644 0.305041 """ - self.assertEqual(self.to_str(data.describe()), self.to_str(res)) + self.check_same(data.describe(), res) res = """ $$roewa_q $$yoyni_q @@ -41,7 +50,29 @@ def test_index_data(self): 2019-07-18 0.175322 0.252650 2019-07-19 0.175322 0.252650 """ - self.assertEqual(self.to_str(data.tail()), self.to_str(res)) + self.check_same(data.tail(), res) + + def test_no_exist_data(self): + fields = ["$$roewa_q", "$$yoyni_q", "$close"] + data = D.features(["sh600519", "sz000858"], fields, start_time="2019-01-01", end_time="20190719", freq="day") + expect = """ + $$roewa_q $$yoyni_q $close + instrument datetime + sh600519 2019-01-02 0.25522 0.243892 124.290070 + 2019-01-03 0.25522 0.243892 122.426697 + 2019-01-04 0.25522 0.243892 124.916748 + 2019-01-07 0.25522 0.243892 125.640930 + 2019-01-08 0.25522 0.243892 125.495667 + ... ... ... ... + sz000858 2019-07-15 NaN NaN 43.153912 + 2019-07-16 NaN NaN 42.632988 + 2019-07-17 NaN NaN 42.639885 + 2019-07-18 NaN NaN 41.742931 + 2019-07-19 NaN NaN 42.136211 + + [266 rows x 3 columns] + """ + self.check_same(data, expect) if __name__ == "__main__": From 9c67303bea05ab4bf6bf60d5853f49c65bf1bb5d Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 10 Mar 2022 11:51:46 +0800 Subject: [PATCH 27/30] Pass test and fix code --- qlib/data/base.py | 91 +++++++++----- qlib/data/data.py | 94 ++++++++++---- qlib/data/ops.py | 104 ++++++++-------- qlib/data/pit.py | 60 +++++++++ qlib/utils/__init__.py | 14 ++- scripts/data_collector/pit/test_pit.py | 166 ++++++++++++++++++++----- scripts/dump_pit.py | 5 + 7 files changed, 391 insertions(+), 143 deletions(-) create mode 100644 qlib/data/pit.py diff --git a/qlib/data/base.py b/qlib/data/base.py index e0a0303d79..75557db7f1 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -136,8 +136,18 @@ def __ror__(self, other): return Or(other, self) - def load(self, instrument, start_index, end_index, freq): + def load(self, instrument, start_index, end_index, *args): """load feature + This function is responsible for loading feature/expression based on the expression engine. + + The concerate implementation will be seperated by two parts + 1) caching data, handle errors. + - This part is shared by all the expressions and implemented in Expression + 2) processing and calculating data based on the specific expression. + - This part is different in each expression and implemented in each expression + + Expresion Engine is shared by different data. + Different data will have different extra infomation for `args`. Parameters ---------- @@ -147,8 +157,15 @@ def load(self, instrument, start_index, end_index, freq): feature start index [in calendar]. end_index : str feature end index [in calendar]. - freq : str - feature frequency. + + *args may contains following information; + 1) if it is used in basic experssion engine data, it contains following arguments + freq : str + feature frequency. 
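A minimal sketch of the two-part split described in the docstring above: the shared `load` builds a cache key (now including the variadic `*args`) and handles errors, while each concrete expression only implements `_load_internal`. The class names and the module-level dictionary here are illustrative stand-ins, not qlib's real `Expression` class or its `H["f"]` memory cache.

import pandas as pd

_CACHE = {}  # stand-in for the shared expression cache

class MiniExpression:
    def load(self, instrument, start_index, end_index, *args):
        cache_key = (str(self), instrument, start_index, end_index, *args)
        if cache_key in _CACHE:                        # part 1: shared caching
            return _CACHE[cache_key]
        series = self._load_internal(instrument, start_index, end_index, *args)
        series.name = str(self)
        _CACHE[cache_key] = series
        return series

    def _load_internal(self, instrument, start_index, end_index, *args):
        raise NotImplementedError                      # part 2: per-expression logic

class ConstOne(MiniExpression):
    def __str__(self):
        return "ConstOne"

    def _load_internal(self, instrument, start_index, end_index, *args):
        return pd.Series(1.0, index=pd.RangeIndex(start_index, end_index + 1))

print(ConstOne().load("sh600519", 0, 4, "day"))        # "day" plays the role of freq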
+ + 2) if is used in PIT data, it contains following arguments + cur_pit: + it is designed for the point-in-time data. Returns ---------- @@ -158,26 +175,26 @@ def load(self, instrument, start_index, end_index, freq): from .cache import H # pylint: disable=C0415 # cache - args = str(self), instrument, start_index, end_index, freq - if args in H["f"]: - return H["f"][args] + cache_key = str(self), instrument, start_index, end_index, *args + if cache_key in H["f"]: + return H["f"][cache_key] if start_index is not None and end_index is not None and start_index > end_index: raise ValueError("Invalid index range: {} {}".format(start_index, end_index)) try: - series = self._load_internal(instrument, start_index, end_index, freq) + series = self._load_internal(instrument, start_index, end_index, *args) except Exception as e: get_module_logger("data").debug( f"Loading data error: instrument={instrument}, expression={str(self)}, " - f"start_index={start_index}, end_index={end_index}, freq={freq}. " + f"start_index={start_index}, end_index={end_index}, args={args}. " f"error info: {str(e)}" ) raise series.name = str(self) - H["f"][args] = series + H["f"][cache_key] = series return series @abc.abstractmethod - def _load_internal(self, instrument, start_index, end_index, freq): + def _load_internal(self, instrument, start_index, end_index, *args) -> pd.Series: raise NotImplementedError("This function must be implemented in your newly defined feature") @abc.abstractmethod @@ -237,6 +254,16 @@ def get_extended_window_size(self): return 0, 0 +class PFeature(Feature): + def __str__(self): + return "$$" + self._name + + def _load_internal(self, instrument, start_index, end_index, cur_time): + from .data import PITD # pylint: disable=C0415 + + return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time) + + class ExpressionOps(Expression): """Operator Expression @@ -501,13 +528,13 @@ def load(self, instrument, start_index, end_index, freq): resample_data = np.empty(end_index - start_index + 1, dtype="float32") for cur_index in range(start_index, end_index + 1): - cur_date = _calendar[cur_index] + cur_time = _calendar[cur_index] # To load expression accurately, more historical data are required start_offset = self.get_period_offset(cur_index) # The calculated value will always the last element, so the end_offset is zero. try: resample_data[cur_index - start_index] = self.load_period_data( - instrument, start_offset, 0, cur_date, info=(start_index, end_index, cur_index) + instrument, start_offset, 0, cur_time, info=(start_index, end_index, cur_index) ).iloc[-1] except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") @@ -526,26 +553,26 @@ def get_extended_window_size(self): return 0, 0 -class PFeature(PExpression): - def __init__(self, name=None): - if name: - self._name = name.lower() - else: - self._name = type(self).__name__.lower() - - def __str__(self): - return "$$" + self._name - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - # BUG: cur_idnex is a date!!!!! 
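To make the delegation above concrete: `PITD.period_feature` returns a series indexed by reporting periods as known at `cur_time`, and the collapsing operator introduced later in this patch takes its last element to obtain the value observable on that calendar day. The toy stand-in below uses made-up numbers; only the announcement date 2019-07-18 (Maotai's 2019Q2 report, per the test comments) is taken from the document.

import pandas as pd

def fake_period_feature(cur_time: pd.Timestamp) -> pd.Series:
    """Made-up stand-in: index = reporting periods, values = figures known at cur_time."""
    known = pd.Series({201804: 0.30, 201901: 0.09})
    if cur_time >= pd.Timestamp("2019-07-18"):   # 2019Q2 report published that day
        known.loc[201902] = 0.17
    return known

for day in ("2019-07-17", "2019-07-18"):
    s = fake_period_feature(pd.Timestamp(day))
    print(day, "->", s.iloc[-1])   # the value a daily expression observes
# 2019-07-17 -> 0.09   (latest known period is still 2019Q1)
# 2019-07-18 -> 0.17   (2019Q2 becomes visible on its announcement date)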
- ### Zhou Code - from .data import PITD - - return PITD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) - # return pd.Series([1, 2, 3]) # fot test - - def get_period_offset(self, cur_index): - return 0 +# class PFeature(PExpression): +# def __init__(self, name=None): +# if name: +# self._name = name.lower() +# else: +# self._name = type(self).__name__.lower() +# +# def __str__(self): +# return "$$" + self._name +# +# def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): +# # BUG: cur_idnex is a date!!!!! +# ### Zhou Code +# from .data import PITD +# +# return PITD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) +# # return pd.Series([1, 2, 3]) # fot test +# +# def get_period_offset(self, cur_index): +# return 0 class PExpressionOps(PExpression): diff --git a/qlib/data/data.py b/qlib/data/data.py index 5cf095cbcd..aef27dbb25 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -337,9 +337,27 @@ def feature(self, instrument, field, start_time, end_time, freq): class PITProvider(abc.ABC): @abc.abstractmethod - def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): + def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series: """ - get the historical periods data series for `start_offset` and `end_offset` + get the historical periods data series between `start_index` and `end_index` + + Parameters + ---------- + start_index: int + start_index is a relative index to the latest period to cur_time + + end_index: int + end_index is a relative index to the latest period to cur_time + in most cases, the start_index and end_index will be a non-positive values + For example, start_index == -3 end_index == 0 and current period index is cur_idx, + then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved. + + Returns + ------- + pd.Series + The index will be integers to indicate the periods of the data + An typical examples will be + TODO Raises ------ @@ -714,8 +732,16 @@ def feature(self, instrument, field, start_index, end_index, freq): class LocalPITProvider(PITProvider): # TODO: Add PIT backend file storage + # NOTE: This class is not multi-threading-safe!!!! + + def period_feature(self, instrument, field, start_index, end_index, cur_time): + if not isinstance(cur_time, pd.Timestamp): + raise ValueError( + f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 
'P($$roewa_q)')" + ) + + assert end_index <= 0 # PIT don't support querying future data - def period_feature(self, instrument, field, start_offset, end_offset, cur_date, **kwargs): DATA_RECORDS = [ ("date", C.pit_record_type["date"]), ("period", C.pit_record_type["period"]), @@ -727,15 +753,17 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, field = str(field).lower()[2:] instrument = code_to_fname(instrument) - start_index, end_index, cur_index = kwargs["info"] - if cur_index == start_index: - if not hasattr(self, "all_fields"): - self.all_fields = [] - self.all_fields.append(field) - if not hasattr(self, "period_index"): - self.period_index = {} - if field not in self.period_index: - self.period_index[field] = {} + # {For acceleration + # start_index, end_index, cur_index = kwargs["info"] + # if cur_index == start_index: + # if not hasattr(self, "all_fields"): + # self.all_fields = [] + # self.all_fields.append(field) + # if not hasattr(self, "period_index"): + # self.period_index = {} + # if field not in self.period_index: + # self.period_index[field] = {} + # For acceleration} if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") @@ -744,31 +772,43 @@ def period_feature(self, instrument, field, start_offset, end_offset, cur_date, data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" if not (index_path.exists() and data_path.exists()): raise FileNotFoundError("No file is found. Raise exception and ") + # NOTE: The most significant performance loss is here. + # Does the accelration that makes the program complicated really matters? + # - It make parameters parameters of the interface complicate + # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance) + # - If we design it carefully, we can go through for only once to get the historical evolution of the data. + # So I decide to deprecated previous implementation and keep the logic of the program simple + # Instead, I'll add a cache for the index file. 
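The path handling above follows a simple naming convention: a PIT field such as `$$roewa_q` is stripped to `roewa_q`, classified as quarterly or annual by its suffix, and mapped onto `<data_uri>/financial/<instrument>/<field>.{index,data}`. A standalone sketch (the data directory shown is only an example location):

from pathlib import Path

def pit_paths(data_uri, instrument, field):
    """Map a PIT field such as '$$roewa_q' to its index/data files."""
    name = str(field).lower()[2:]                    # strip the leading "$$"
    if not (name.endswith("_q") or name.endswith("_a")):
        raise ValueError("period field must end with '_q' or '_a'")
    quarterly = name.endswith("_q")                  # _q: quarterly, _a: annual
    base = Path(data_uri) / "financial" / instrument.lower()
    return quarterly, base / f"{name}.index", base / f"{name}.data"

print(pit_paths("~/.qlib/qlib_data/cn_data", "SH600519", "$$roewa_q"))
# (True, .../financial/sh600519/roewa_q.index, .../financial/sh600519/roewa_q.data)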
data = np.fromfile(data_path, dtype=DATA_RECORDS) - # find all revision periods before `cur_date` - cur_date = int(cur_date.year) * 10000 + int(cur_date.month) * 100 + int(cur_date.day) - loc = np.searchsorted(data["date"], cur_date, side="right") + # find all revision periods before `cur_time` + cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day) + loc = np.searchsorted(data["date"], cur_time_int, side="right") if loc <= 0: - return C.pit_record_nan["value"] - last_period = data["period"][loc - start_offset - 1 : loc - end_offset].max() # return the latest quarter - first_period = data["period"][loc - start_offset - 1 : loc - end_offset].min() + return pd.Series() + last_period = data["period"][:loc].max() # return the latest quarter + first_period = data["period"][:loc].min() period_list = get_period_list(first_period, last_period, quarterly) - value = np.empty(len(period_list), dtype=VALUE_DTYPE) + period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] + value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE) for i, period in enumerate(period_list): - last_period_index = self.period_index[field].get(period) + # last_period_index = self.period_index[field].get(period) # For acceleration value[i], now_period_index = read_period_data( - index_path, data_path, period, cur_date, quarterly, last_period_index + index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration ) - self.period_index[field].update({period: now_period_index}) + # self.period_index[field].update({period: now_period_index}) # For acceleration + # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan) + # when calculation between different features and only part of its financial indicator is published series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE) - if cur_index == end_index: - self.all_fields.remove(field) - if not len(self.all_fields): - del self.all_fields - del self.period_index + # {For acceleration + # if cur_index == end_index: + # self.all_fields.remove(field) + # if not len(self.all_fields): + # del self.all_fields + # del self.period_index + # For acceleration} return series diff --git a/qlib/data/ops.py b/qlib/data/ops.py index a5076a8b31..d3a5baffdd 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -82,8 +82,8 @@ def __init__(self, feature, func): self.func = func super(NpElemOperator, self).__init__(feature) - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) return getattr(np, self.func)(series) @@ -122,11 +122,11 @@ class Sign(NpElemOperator): def __init__(self, feature): super(Sign, self).__init__(feature, "sign") - def _load_internal(self, instrument, start_index, end_index, freq): + def _load_internal(self, instrument, start_index, end_index, *args): """ To avoid error raised by bool type input, we transform the data into float32. 
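The `period_list` slicing in `LocalPITProvider.period_feature` above deserves a standalone illustration: `start_index` and `end_index` are non-positive offsets relative to the latest period known at `cur_time`, with `end_index == 0` meaning the latest period itself. The helper and sample periods below are illustrative only.

def select_periods(period_list, start_index, end_index):
    # the same slice as in LocalPITProvider.period_feature
    return period_list[max(0, len(period_list) + start_index - 1):
                       len(period_list) + end_index]

periods = [201804, 201901, 201902]          # all periods announced so far
print(select_periods(periods, 0, 0))        # [201902]          latest period only
print(select_periods(periods, -1, 0))       # [201901, 201902]  latest two periods
print(select_periods(periods, -5, 0))       # all three, clipped at the front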
""" - series = self.feature.load(instrument, start_index, end_index, freq) + series = self.feature.load(instrument, start_index, end_index, *args) # TODO: More precision types should be configurable series = series.astype(np.float32) return getattr(np, self.func)(series) @@ -171,8 +171,8 @@ def __init__(self, feature, exponent): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.exponent) - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) return getattr(np, self.func)(series, self.exponent) @@ -199,8 +199,8 @@ def __init__(self, feature, instrument): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower()) - def _load_internal(self, instrument, start_index, end_index, freq): - return self.feature.load(self.instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + return self.feature.load(self.instrument, start_index, end_index, *args) class Not(NpElemOperator): @@ -296,16 +296,16 @@ def __init__(self, feature_left, feature_right, func): self.func = func super(NpPairOperator, self).__init__(feature_left, feature_right) - def _load_internal(self, instrument, start_index, end_index, freq): + def _load_internal(self, instrument, start_index, end_index, *args): assert any( [isinstance(self.feature_left, (Expression, PExpression)), self.feature_right, Expression] ), "at least one of two inputs is Expression instance" if isinstance(self.feature_left, (Expression, PExpression)): - series_left = self.feature_left.load(instrument, start_index, end_index, freq) + series_left = self.feature_left.load(instrument, start_index, end_index, *args) else: series_left = self.feature_left # numeric value if isinstance(self.feature_right, (Expression, PExpression)): - series_right = self.feature_right.load(instrument, start_index, end_index, freq) + series_right = self.feature_right.load(instrument, start_index, end_index, *args) else: series_right = self.feature_right check_length = isinstance(series_left, (np.ndarray, pd.Series)) and isinstance( @@ -635,14 +635,14 @@ def __init__(self, condition, feature_left, feature_right): def __str__(self): return "If({},{},{})".format(self.condition, self.feature_left, self.feature_right) - def _load_internal(self, instrument, start_index, end_index, freq): - series_cond = self.condition.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series_cond = self.condition.load(instrument, start_index, end_index, *args) if isinstance(self.feature_left, (Expression, PExpression)): - series_left = self.feature_left.load(instrument, start_index, end_index, freq) + series_left = self.feature_left.load(instrument, start_index, end_index, *args) else: series_left = self.feature_left if isinstance(self.feature_right, (Expression, PExpression)): - series_right = self.feature_right.load(instrument, start_index, end_index, freq) + series_right = self.feature_right.load(instrument, start_index, end_index, *args) else: series_right = self.feature_right series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index) @@ -717,8 +717,8 @@ def __init__(self, feature, N, func): def __str__(self): return 
"{}({},{})".format(type(self).__name__, self.feature, self.N) - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) # NOTE: remove all null check, # now it's user's responsibility to decide whether use features in null days # isnull = series.isnull() # NOTE: isnull = NaN, inf is not null @@ -775,8 +775,8 @@ class Ref(Rolling): def __init__(self, feature, N): super(Ref, self).__init__(feature, N, "ref") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) # N = 0, return first day if series.empty: return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049 @@ -965,8 +965,8 @@ class IdxMax(Rolling): def __init__(self, feature, N): super(IdxMax, self).__init__(feature, N, "idxmax") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) else: @@ -1013,8 +1013,8 @@ class IdxMin(Rolling): def __init__(self, feature, N): super(IdxMin, self).__init__(feature, N, "idxmin") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) else: @@ -1045,8 +1045,8 @@ def __init__(self, feature, N, qscore): def __str__(self): return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore) - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = series.expanding(min_periods=1).quantile(self.qscore) else: @@ -1093,8 +1093,8 @@ class Mad(Rolling): def __init__(self, feature, N): super(Mad, self).__init__(feature, N, "mad") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) # TODO: implement in Cython def mad(x): @@ -1127,8 +1127,8 @@ class Rank(Rolling): def __init__(self, feature, N): super(Rank, self).__init__(feature, N, "rank") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) # TODO: implement in Cython def rank(x): @@ -1185,8 +1185,8 @@ class Delta(Rolling): def 
__init__(self, feature, N): super(Delta, self).__init__(feature, N, "delta") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = series - series.iloc[0] else: @@ -1223,8 +1223,8 @@ class Slope(Rolling): def __init__(self, feature, N): super(Slope, self).__init__(feature, N, "slope") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = pd.Series(expanding_slope(series.values), index=series.index) else: @@ -1251,8 +1251,8 @@ class Rsquare(Rolling): def __init__(self, feature, N): super(Rsquare, self).__init__(feature, N, "rsquare") - def _load_internal(self, instrument, start_index, end_index, freq): - _series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + _series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = pd.Series(expanding_rsquare(_series.values), index=_series.index) else: @@ -1280,8 +1280,8 @@ class Resi(Rolling): def __init__(self, feature, N): super(Resi, self).__init__(feature, N, "resi") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if self.N == 0: series = pd.Series(expanding_resi(series.values), index=series.index) else: @@ -1308,8 +1308,8 @@ class WMA(Rolling): def __init__(self, feature, N): super(WMA, self).__init__(feature, N, "wma") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) # TODO: implement in Cython def weighted_mean(x): @@ -1343,8 +1343,8 @@ class EMA(Rolling): def __init__(self, feature, N): super(EMA, self).__init__(feature, N, "ema") - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) def exp_weighted_mean(x): a = 1 - 2 / (1 + len(x)) @@ -1390,17 +1390,17 @@ def __init__(self, feature_left, feature_right, N, func): def __str__(self): return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N) - def _load_internal(self, instrument, start_index, end_index, freq): + def _load_internal(self, instrument, start_index, end_index, *args): assert any( [isinstance(self.feature_left, Expression), self.feature_right, Expression] ), "at least one of two inputs is Expression instance" if isinstance(self.feature_left, Expression): - series_left = self.feature_left.load(instrument, start_index, end_index, freq) + series_left = self.feature_left.load(instrument, start_index, end_index, *args) else: series_left = 
self.feature_left # numeric value if isinstance(self.feature_right, Expression): - series_right = self.feature_right.load(instrument, start_index, end_index, freq) + series_right = self.feature_right.load(instrument, start_index, end_index, *args) else: series_right = self.feature_right @@ -1463,12 +1463,12 @@ class Corr(PairRolling): def __init__(self, feature_left, feature_right, N): super(Corr, self).__init__(feature_left, feature_right, N, "corr") - def _load_internal(self, instrument, start_index, end_index, freq): - res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, *args) # NOTE: Load uses MemCache, so calling load again will not cause performance degradation - series_left = self.feature_left.load(instrument, start_index, end_index, freq) - series_right = self.feature_right.load(instrument, start_index, end_index, freq) + series_left = self.feature_left.load(instrument, start_index, end_index, *args) + series_right = self.feature_right.load(instrument, start_index, end_index, *args) res.loc[ np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05) @@ -1527,8 +1527,8 @@ def __init__(self, feature, freq, func): def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature, self.freq) - def _load_internal(self, instrument, start_index, end_index, freq): - series = self.feature.load(instrument, start_index, end_index, freq) + def _load_internal(self, instrument, start_index, end_index, *args): + series = self.feature.load(instrument, start_index, end_index, *args) if series.empty: return series @@ -1646,8 +1646,10 @@ def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") + from qlib.data.pit import P + # Operators.reset() - Operators.register(OpsList) + Operators.register(OpsList + [P]) # FIXME: I don't think it is necessary from .ops_period import PeriodOpsList diff --git a/qlib/data/pit.py b/qlib/data/pit.py new file mode 100644 index 0000000000..974137f8c4 --- /dev/null +++ b/qlib/data/pit.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +Qlib follow the logic below to supporting point-in-time database + +For each stock, the format of its data is . Expression Engine support calculation on such format of data + +To calculate the feature value f_t at a specific observe time t, data with format will be used. +For example, the average earning of last 4 quarters (period_time) on 20190719 (observe_time) + +The calculation of both and data rely on expression engine. It consists of 2 phases. +1) calculation at each observation time t and it will collasped into a point (just like a normal feature) +2) concatenate all th collasped data, we will get data with format . +Qlib will use the operator `P` to perform the collapse. 
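With `P` registered, PIT fields are queried through the ordinary feature interface: `P(...)` collapses the period series into one value per trading day. A usage sketch, assuming qlib has been initialized and the PIT data for sh600519 has been dumped as described in the tests below:

from qlib.data import D

fields = ["P($$roewa_q)", "P(Mean($$roewa_q, 2))"]   # latest value and 2-period mean
df = D.features(["sh600519"], fields,
                start_time="2019-01-01", end_time="20190719", freq="day")
print(df.tail())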
+""" +import numpy as np +import pandas as pd +from qlib.data.ops import ElemOperator +from qlib.log import get_module_logger + + +class P(ElemOperator): + def _load_internal(self, instrument, start_index, end_index, freq): + from .data import Cal + + _calendar = Cal.calendar(freq=freq) + resample_data = np.empty(end_index - start_index + 1, dtype="float32") + + for cur_index in range(start_index, end_index + 1): + # YXDEBUG: + # if cur_index == end_index - 3: + # __import__('ipdb').set_trace() + cur_time = _calendar[cur_index] + # To load expression accurately, more historical data are required + start_ws, end_ws = self.feature.get_extended_window_size() + if end_ws > 0: + raise ValueError( + "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported" + ) + + # The calculated value will always the last element, so the end_offset is zero. + try: + s = self.feature.load(instrument, -start_ws, 0, cur_time) + resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan + except FileNotFoundError: + get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + return pd.Series(dtype="float32", name=str(self)) + + resample_series = pd.Series( + resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) + ) + return resample_series + + def get_longest_back_rolling(self): + # The period data will collapse as a normal feature. So no extending and looking back + return 0 + + def get_extended_window_size(self): + # The period data will collapse as a normal feature. So no extending and looking back + return 0, 0 diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 06956b1446..b7cdc79e07 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -101,11 +101,21 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date, quarterly, last_period_index): +def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. + Parameters + ---------- + period: int + date period represented by interger, e.g. 201901 corresponds to the first quarter in 2019 + cur_date_int: int + date which represented by interger, e.g. 20190102 + last_period_index: int + it is a optional parameter; it is designed to avoid repeatedly access the .index data of PIT database when + sequentially observing the data (Because the latest index of a specific period of data certainly appear in after the one in last observation). 
+ Returns ------- the query value and byte index the index value @@ -143,7 +153,7 @@ def read_period_data(index_path, data_path, period, cur_date, quarterly, last_pe while _next != NAN_INDEX: fd.seek(_next) date, period, value, new_next = struct.unpack(DATA_DTYPE, fd.read(struct.calcsize(DATA_DTYPE))) - if date > cur_date: + if date > cur_date_int: break prev_next = _next _next = new_next diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index befc1acc13..11ca1c7443 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -21,59 +21,163 @@ def to_str(self, obj): def check_same(self, a, b): self.assertEqual(self.to_str(a), self.to_str(b)) - def test_index_data(self): + def test_query(self): instruments = ["sh600519"] - fields = ["$$roewa_q", "$$yoyni_q"] + fields = ["P($$roewa_q)", "P($$yoyni_q)"] # Mao Tai published 2019Q2 report at 2019-07-13 & 2019-07-18 # - http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") + print(data) + res = """ - $$roewa_q $$yoyni_q - count 133.000000 133.000000 - mean 0.196412 0.277930 - std 0.097591 0.030262 - min 0.000000 0.243892 - 25% 0.094737 0.243892 - 50% 0.255220 0.304181 - 75% 0.255220 0.305041 - max 0.344644 0.305041 + P($$roewa_q) P($$yoyni_q) + count 133.000000 133.000000 + mean 0.196412 0.277930 + std 0.097591 0.030262 + min 0.000000 0.243892 + 25% 0.094737 0.243892 + 50% 0.255220 0.304181 + 75% 0.255220 0.305041 + max 0.344644 0.305041 """ self.check_same(data.describe(), res) res = """ - $$roewa_q $$yoyni_q + P($$roewa_q) P($$yoyni_q) instrument datetime - sh600519 2019-07-15 0.000000 0.305041 - 2019-07-16 0.000000 0.305041 - 2019-07-17 0.000000 0.305041 - 2019-07-18 0.175322 0.252650 - 2019-07-19 0.175322 0.252650 + sh600519 2019-07-15 0.000000 0.305041 + 2019-07-16 0.000000 0.305041 + 2019-07-17 0.000000 0.305041 + 2019-07-18 0.175322 0.252650 + 2019-07-19 0.175322 0.252650 """ self.check_same(data.tail(), res) def test_no_exist_data(self): - fields = ["$$roewa_q", "$$yoyni_q", "$close"] - data = D.features(["sh600519", "sz000858"], fields, start_time="2019-01-01", end_time="20190719", freq="day") + fields = ["P($$roewa_q)", "P($$yoyni_q)", "$close"] + data = D.features(["sh600519", "sh601988"], fields, start_time="2019-01-01", end_time="20190719", freq="day") + data["$close"] = 1 # in case of different dataset gives different values + print(data) expect = """ - $$roewa_q $$yoyni_q $close + P($$roewa_q) P($$yoyni_q) $close instrument datetime - sh600519 2019-01-02 0.25522 0.243892 124.290070 - 2019-01-03 0.25522 0.243892 122.426697 - 2019-01-04 0.25522 0.243892 124.916748 - 2019-01-07 0.25522 0.243892 125.640930 - 2019-01-08 0.25522 0.243892 125.495667 - ... ... ... ... - sz000858 2019-07-15 NaN NaN 43.153912 - 2019-07-16 NaN NaN 42.632988 - 2019-07-17 NaN NaN 42.639885 - 2019-07-18 NaN NaN 41.742931 - 2019-07-19 NaN NaN 42.136211 + sh600519 2019-01-02 0.25522 0.243892 1 + 2019-01-03 0.25522 0.243892 1 + 2019-01-04 0.25522 0.243892 1 + 2019-01-07 0.25522 0.243892 1 + 2019-01-08 0.25522 0.243892 1 + ... ... ... ... 
+ sh601988 2019-07-15 NaN NaN 1 + 2019-07-16 NaN NaN 1 + 2019-07-17 NaN NaN 1 + 2019-07-18 NaN NaN 1 + 2019-07-19 NaN NaN 1 [266 rows x 3 columns] """ self.check_same(data, expect) + def test_expr(self): + fields = [ + "P(Mean($$roewa_q, 1))", + "P($$roewa_q)", + "P(Mean($$roewa_q, 2))", + "P(Ref($$roewa_q, 1))", + "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)", + ] + instruments = ["sh600519"] + data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") + expect = """ + P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2) + instrument datetime + sh600519 2019-07-01 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-02 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-03 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-04 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-05 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-08 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-09 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-10 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-11 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-12 0.094737 0.094737 0.219691 0.344644 0.219691 + 2019-07-15 0.000000 0.000000 0.047369 0.094737 0.047369 + 2019-07-16 0.000000 0.000000 0.047369 0.094737 0.047369 + 2019-07-17 0.000000 0.000000 0.047369 0.094737 0.047369 + 2019-07-18 0.175322 0.175322 0.135029 0.094737 0.135029 + 2019-07-19 0.175322 0.175322 0.135029 0.094737 0.135029 + """ + self.check_same(data.tail(15), expect) + + def test_unlimit(self): + # fields = ["P(Mean($$roewa_q, 1))", "P($$roewa_q)", "P(Mean($$roewa_q, 2))", "P(Ref($$roewa_q, 1))", "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)"] + fields = ["P($$roewa_q)"] + instruments = ["sh600519"] + _ = D.features(instruments, fields, freq="day") # this should not raise error + data = D.features(instruments, fields, end_time="20200101", freq="day") # this should not raise error + s = data.iloc[:, 0] + # You can check the expected value based on the content in `docs/advanced/PIT.rst` + expect = """ + instrument datetime + sh600519 1999-11-10 NaN + 2007-04-30 0.090219 + 2007-08-17 0.139330 + 2007-10-23 0.245863 + 2008-03-03 0.347900 + 2008-03-13 0.395989 + 2008-04-22 0.100724 + 2008-08-28 0.249968 + 2008-10-27 0.334120 + 2009-03-25 0.390117 + 2009-04-21 0.102675 + 2009-08-07 0.230712 + 2009-10-26 0.300730 + 2010-04-02 0.335461 + 2010-04-26 0.083825 + 2010-08-12 0.200545 + 2010-10-29 0.260986 + 2011-03-21 0.307393 + 2011-04-25 0.097411 + 2011-08-31 0.248251 + 2011-10-18 0.318919 + 2012-03-23 0.403900 + 2012-04-11 0.403925 + 2012-04-26 0.112148 + 2012-08-10 0.264847 + 2012-10-26 0.370487 + 2013-03-29 0.450047 + 2013-04-18 0.099958 + 2013-09-02 0.210442 + 2013-10-16 0.304543 + 2014-03-25 0.394328 + 2014-04-25 0.083217 + 2014-08-29 0.164503 + 2014-10-30 0.234085 + 2015-04-21 0.078494 + 2015-08-28 0.137504 + 2015-10-26 0.201709 + 2016-03-24 0.264205 + 2016-04-21 0.073664 + 2016-08-29 0.136576 + 2016-10-31 0.188062 + 2017-04-17 0.244385 + 2017-04-25 0.080614 + 2017-07-28 0.151510 + 2017-10-26 0.254166 + 2018-03-28 0.329542 + 2018-05-02 0.088887 + 2018-08-02 0.170563 + 2018-10-29 0.255220 + 2019-03-29 0.344644 + 2019-04-25 0.094737 + 2019-07-15 0.000000 + 2019-07-18 0.175322 + 2019-10-16 0.255819 + Name: P($$roewa_q), dtype: float32 + """ + + self.check_same(s[~s.duplicated().values], expect) + if __name__ == "__main__": unittest.main() diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index 9bff2f0f2f..cda872c09f 100644 
--- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -1,5 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +""" +TODO: +- A more well-designed PIT database is required. + - seperated insert, delete, update, query operations are required. +""" import abc import shutil From 69cf2ab47ffbdac8a42b9da273ca4fc380a1cd20 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 10 Mar 2022 12:18:09 +0800 Subject: [PATCH 28/30] remove useless PIT code --- docs/advanced/PIT.rst | 1 + qlib/data/base.py | 311 ------------ qlib/data/ops.py | 44 +- qlib/data/ops_period.py | 660 ------------------------- qlib/data/pit.py | 3 - scripts/data_collector/pit/test_pit.py | 13 +- tests/notest_PIT.py | 30 -- 7 files changed, 31 insertions(+), 1031 deletions(-) delete mode 100644 qlib/data/ops_period.py delete mode 100644 tests/notest_PIT.py diff --git a/docs/advanced/PIT.rst b/docs/advanced/PIT.rst index 12cfd5397f..728c98b442 100644 --- a/docs/advanced/PIT.rst +++ b/docs/advanced/PIT.rst @@ -130,3 +130,4 @@ The statements are soted by the `date` in ascending order from the beginning of Known limitations - Currently, the PIT database is designed for quarterly or annually factors, which can handle fundamental data of financial reports in most markets. Qlib leverage the file name to identify the type of the data. File with name like `XXX_q.data` corresponds to quarterly data. File with name like `XXX_a.data` corresponds to annual data +- The caclulation of PIT is not performed in the optimal way. There is great potential to boost the performance of PIT data calcuation. diff --git a/qlib/data/base.py b/qlib/data/base.py index 75557db7f1..e93938f610 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -272,314 +272,3 @@ class ExpressionOps(Expression): """ pass - - -class PExpression(abc.ABC): - """PExpression base class""" - - def __str__(self): - return type(self).__name__ - - def __repr__(self): - return str(self) - - def __gt__(self, other): - if isinstance(other, Expression): - from .ops import Gt - - return Gt(self, other) - else: - from .ops_period import PGt - - return PGt(self, other) - - def __ge__(self, other): - if isinstance(other, Expression): - from .ops import Ge - - return Ge(self, other) - else: - from .ops_period import PGe - - return PGe(self, other) - - def __lt__(self, other): - if isinstance(other, Expression): - from .ops import Lt - - return Lt(self, other) - else: - from .ops_period import PLt - - return PLt(self, other) - - def __le__(self, other): - if isinstance(other, Expression): - from .ops import Le - - return Le(self, other) - else: - from .ops_period import PLe - - return PLe(self, other) - - def __eq__(self, other): - if isinstance(other, Expression): - from .ops import Eq - - return Eq(self, other) - else: - from .ops_period import PEq - - return PEq(self, other) - - def __ne__(self, other): - if isinstance(other, Expression): - from .ops import Ne - - return Ne(self, other) - else: - from .ops_period import PNe - - return PNe(self, other) - - def __add__(self, other): - if isinstance(other, Expression): - from .ops import Add - - return Add(self, other) - else: - from .ops_period import PAdd - - return PAdd(self, other) - - def __radd__(self, other): - if isinstance(other, Expression): - from .ops import Add - - return Add(other, self) - else: - from .ops_period import PAdd - - return PAdd(other, self) - - def __sub__(self, other): - if isinstance(other, Expression): - from .ops import Sub - - return Sub(self, other) - else: - from .ops_period import 
PSub - - return PSub(self, other) - - def __rsub__(self, other): - if isinstance(other, Expression): - from .ops import Sub - - return Sub(other, self) - else: - from .ops_period import PSub - - return PSub(other, self) - - def __mul__(self, other): - if isinstance(other, Expression): - from .ops import Mul - - return Mul(self, other) - else: - from .ops_period import PMul - - return PMul(self, other) - - def __rmul__(self, other): - if isinstance(other, Expression): - from .ops import Mul - - return Mul(other, self) - else: - from .ops_period import PMul - - return PMul(other, self) - - def __div__(self, other): - if isinstance(other, Expression): - from .ops import Div - - return Div(self, other) - else: - from .ops_period import PDiv - - return PDiv(self, other) - - def __rdiv__(self, other): - if isinstance(other, Expression): - from .ops import Div - - return Div(other, self) - else: - from .ops_period import PDiv - - return PDiv(other, self) - - def __truediv__(self, other): - if isinstance(other, Expression): - from .ops import Div - - return Div(self, other) - else: - from .ops_period import PDiv - - return PDiv(self, other) - - def __rtruediv__(self, other): - if isinstance(other, Expression): - from .ops import Div - - return Div(other, self) - else: - from .ops_period import PDiv - - return PDiv(other, self) - - def __pow__(self, other): - if isinstance(other, Expression): - from .ops import Power - - return Power(self, other) - else: - from .ops_period import PPower - - return PPower(self, other) - - def __and__(self, other): - if isinstance(other, Expression): - from .ops import And - - return And(self, other) - else: - from .ops_period import PAnd - - return PAnd(self, other) - - def __rand__(self, other): - if isinstance(other, Expression): - from .ops import And - - return And(other, self) - else: - from .ops_period import PAnd - - return PAnd(other, self) - - def __or__(self, other): - if isinstance(other, Expression): - from .ops import Or - - return Or(self, other) - else: - from .ops_period import POr - - return POr(self, other) - - def __ror__(self, other): - if isinstance(other, Expression): - from .ops import Or - - return Or(other, self) - else: - from .ops_period import POr - - return POr(other, self) - - @abc.abstractmethod - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - raise NotImplementedError("This function must be implemented in your newly defined feature") - - @abc.abstractmethod - def get_period_offset(self, cur_index): - raise NotImplementedError("This function must be implemented in your newly defined feature") - - def check_feature_exist(self, instrument): - child_exist_list = [ - v.check_feature_exist(instrument) for k, v in self.__dict__.items() if isinstance(v, PExpression) - ] - return all(child_exist_list) - - def load(self, instrument, start_index, end_index, freq): - - if not self.check_feature_exist(instrument): - get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") - return pd.Series(dtype="float32", name=str(self)) - - from .cache import H - - # cache - args = str(self), instrument, start_index, end_index, freq - if args in H["f"]: - return H["f"][args] - if start_index is None or end_index is None or start_index > end_index: - raise ValueError("Invalid index range: {} {}".format(start_index, end_index)) - - from .data import Cal - - _calendar = Cal.calendar(freq=freq) - resample_data = np.empty(end_index - start_index + 1, dtype="float32") - - for cur_index in 
range(start_index, end_index + 1): - cur_time = _calendar[cur_index] - # To load expression accurately, more historical data are required - start_offset = self.get_period_offset(cur_index) - # The calculated value will always the last element, so the end_offset is zero. - try: - resample_data[cur_index - start_index] = self.load_period_data( - instrument, start_offset, 0, cur_time, info=(start_index, end_index, cur_index) - ).iloc[-1] - except FileNotFoundError: - get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") - return pd.Series(dtype="float32", name=str(self)) - - resample_series = pd.Series( - resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self) - ) - H["f"][args] = resample_series - return resample_series - - def get_longest_back_rolling(self): - return 0 - - def get_extended_window_size(self): - return 0, 0 - - -# class PFeature(PExpression): -# def __init__(self, name=None): -# if name: -# self._name = name.lower() -# else: -# self._name = type(self).__name__.lower() -# -# def __str__(self): -# return "$$" + self._name -# -# def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): -# # BUG: cur_idnex is a date!!!!! -# ### Zhou Code -# from .data import PITD -# -# return PITD.period_feature(instrument, str(self), start_offset, end_offset, cur_index, **kwargs) -# # return pd.Series([1, 2, 3]) # fot test -# -# def get_period_offset(self, cur_index): -# return 0 - - -class PExpressionOps(PExpression): - """Operator Expression - - This kind of feature will use operator for feature - construction on the fly. - """ - - pass diff --git a/qlib/data/ops.py b/qlib/data/ops.py index d3a5baffdd..fd7eb94bb6 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -10,7 +10,7 @@ from typing import Union, List, Type from scipy.stats import percentileofscore -from .base import Expression, ExpressionOps, Feature, PExpression, PFeature +from .base import Expression, ExpressionOps, Feature, PFeature from ..log import get_module_logger from ..utils import get_callable_kwargs @@ -250,24 +250,24 @@ def __str__(self): return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right) def get_longest_back_rolling(self): - if isinstance(self.feature_left, (Expression, PExpression)): + if isinstance(self.feature_left, (Expression,)): left_br = self.feature_left.get_longest_back_rolling() else: left_br = 0 - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): right_br = self.feature_right.get_longest_back_rolling() else: right_br = 0 return max(left_br, right_br) def get_extended_window_size(self): - if isinstance(self.feature_left, (Expression, PExpression)): + if isinstance(self.feature_left, (Expression,)): ll, lr = self.feature_left.get_extended_window_size() else: ll, lr = 0, 0 - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): rl, rr = self.feature_right.get_extended_window_size() else: rl, rr = 0, 0 @@ -298,13 +298,13 @@ def __init__(self, feature_left, feature_right, func): def _load_internal(self, instrument, start_index, end_index, *args): assert any( - [isinstance(self.feature_left, (Expression, PExpression)), self.feature_right, Expression] + [isinstance(self.feature_left, (Expression,)), self.feature_right, Expression] ), "at least one of two inputs is Expression instance" - if isinstance(self.feature_left, (Expression, PExpression)): + if 
isinstance(self.feature_left, (Expression,)): series_left = self.feature_left.load(instrument, start_index, end_index, *args) else: series_left = self.feature_left # numeric value - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): series_right = self.feature_right.load(instrument, start_index, end_index, *args) else: series_right = self.feature_right @@ -637,11 +637,11 @@ def __str__(self): def _load_internal(self, instrument, start_index, end_index, *args): series_cond = self.condition.load(instrument, start_index, end_index, *args) - if isinstance(self.feature_left, (Expression, PExpression)): + if isinstance(self.feature_left, (Expression,)): series_left = self.feature_left.load(instrument, start_index, end_index, *args) else: series_left = self.feature_left - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): series_right = self.feature_right.load(instrument, start_index, end_index, *args) else: series_right = self.feature_right @@ -649,34 +649,34 @@ def _load_internal(self, instrument, start_index, end_index, *args): return series def get_longest_back_rolling(self): - if isinstance(self.feature_left, (Expression, PExpression)): + if isinstance(self.feature_left, (Expression,)): left_br = self.feature_left.get_longest_back_rolling() else: left_br = 0 - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): right_br = self.feature_right.get_longest_back_rolling() else: right_br = 0 - if isinstance(self.condition, (Expression, PExpression)): + if isinstance(self.condition, (Expression,)): c_br = self.condition.get_longest_back_rolling() else: c_br = 0 return max(left_br, right_br, c_br) def get_extended_window_size(self): - if isinstance(self.feature_left, (Expression, PExpression)): + if isinstance(self.feature_left, (Expression,)): ll, lr = self.feature_left.get_extended_window_size() else: ll, lr = 0, 0 - if isinstance(self.feature_right, (Expression, PExpression)): + if isinstance(self.feature_right, (Expression,)): rl, rr = self.feature_right.get_extended_window_size() else: rl, rr = 0, 0 - if isinstance(self.condition, (Expression, PExpression)): + if isinstance(self.condition, (Expression,)): cl, cr = self.condition.get_extended_window_size() else: cl, cr = 0, 0 @@ -1621,10 +1621,7 @@ def register(self, ops_list: List[Union[Type[ExpressionOps], dict]]): else: _ops_class = _operator - # FIXME: remove PExpression - from .ops_period import PExpression - - if not issubclass(_ops_class, (Expression, PExpression)): + if not issubclass(_ops_class, (Expression,)): raise TypeError("operator must be subclass of ExpressionOps, not {}".format(_ops_class)) if _ops_class.__name__ in self._ops: @@ -1648,14 +1645,9 @@ def register_all_ops(C): from qlib.data.pit import P - # Operators.reset() + Operators.reset() Operators.register(OpsList + [P]) - # FIXME: I don't think it is necessary - from .ops_period import PeriodOpsList - - Operators.register(PeriodOpsList) - if getattr(C, "custom_ops", None) is not None: Operators.register(C.custom_ops) logger.debug("register custom operator {}".format(C.custom_ops)) diff --git a/qlib/data/ops_period.py b/qlib/data/ops_period.py deleted file mode 100644 index b68a742f89..0000000000 --- a/qlib/data/ops_period.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- - -from __future__ import division -from __future__ import print_function - -import sys -import abc -import numpy as np -import pandas as pd - -from scipy.stats import percentileofscore - -from .base import PExpression, PExpressionOps -from ..log import get_module_logger - -try: - from ._libs.rolling import rolling_slope, rolling_rsquare, rolling_resi - from ._libs.expanding import expanding_slope, expanding_rsquare, expanding_resi -except ImportError: - print( - "#### Do not import qlib package in the repository directory in case of importing qlib from . without compiling #####" - ) - raise - - -np.seterr(invalid="ignore") - -#################### Element-Wise Operator #################### - - -class PElemOperator(PExpressionOps): - def __init__(self, feature): - self.feature = feature - - def __str__(self): - return "{}({})".format(type(self).__name__, self.feature) - - def get_period_offset(self, cur_index): - return self.feature.get_period_offset(cur_index) - - -class PNpElemOperator(PElemOperator): - def __init__(self, feature, func): - self.func = func - super(PNpElemOperator, self).__init__(feature) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - return getattr(np, self.func)(series) - - -class PAbs(PNpElemOperator): - def __init__(self, feature): - super(PAbs, self).__init__(feature, "abs") - - -class PSign(PNpElemOperator): - def __init__(self, feature): - super(PSign, self).__init__(feature, "sign") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - """ - To avoid error raised by bool type input, we transform the data into float32. - """ - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # TODO: More precision types should be configurable - series = series.astype(np.float32) - return getattr(np, self.func)(series) - - -class PLog(PNpElemOperator): - def __init__(self, feature): - super(PLog, self).__init__(feature, "log") - - -class PPower(PNpElemOperator): - def __init__(self, feature, exponent): - super(PPower, self).__init__(feature, "power") - self.exponent = exponent - - def __str__(self): - return "{}({},{})".format(type(self).__name__, self.feature, self.exponent) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - return getattr(np, self.func)(series, self.exponent) - - -class PMask(PNpElemOperator): - def __init__(self, feature, instrument): - super(PMask, self).__init__(feature, "mask") - self.instrument = instrument - - def __str__(self): - return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower()) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - - return self.feature.load_period_data(self.instrument, start_offset, end_offset, cur_index) - - -class PNot(PNpElemOperator): - def __init__(self, feature): - super(PNot, self).__init__(feature, "bitwise_not") - - -#################### Pair-Wise Operator #################### -class PPairOperator(PExpressionOps): - def __init__(self, feature_left, feature_right): - self.feature_left = feature_left - self.feature_right = feature_right - - def __str__(self): - return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right) - - def get_period_offset(self, 
cur_index): - if isinstance(self.feature_left, PExpression): - left_br = self.feature_left.get_period_offset(cur_index) - else: - left_br = 0 - - if isinstance(self.feature_right, PExpression): - right_br = self.feature_right.get_period_offset(cur_index) - else: - right_br = 0 - return max(left_br, right_br) - - -class PNpPairOperator(PPairOperator): - def __init__(self, feature_left, feature_right, func): - self.feature_left = feature_left - self.feature_right = feature_right - self.func = func - super(PNpPairOperator, self).__init__(feature_left, feature_right) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - assert any( - [isinstance(self.feature_left, PExpression), self.feature_right, PExpression] - ), "at least one of two inputs is PExpression instance" - if isinstance(self.feature_left, PExpression): - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - else: - series_left = self.feature_left # numeric value - if isinstance(self.feature_right, PExpression): - series_right = self.feature_right.load_period_data( - instrument, start_offset, end_offset, cur_index, **kwargs - ) - else: - series_right = self.feature_right - return getattr(np, self.func)(series_left, series_right) - - -class PAdd(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PAdd, self).__init__(feature_left, feature_right, "add") - - -class PSub(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PSub, self).__init__(feature_left, feature_right, "subtract") - - -class PMul(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PMul, self).__init__(feature_left, feature_right, "multiply") - - -class PDiv(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PDiv, self).__init__(feature_left, feature_right, "divide") - - -class PGreater(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PGreater, self).__init__(feature_left, feature_right, "maximum") - - -class PLess(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PLess, self).__init__(feature_left, feature_right, "minimum") - - -class PGt(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PGt, self).__init__(feature_left, feature_right, "greater") - - -class PGe(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PGe, self).__init__(feature_left, feature_right, "greater_equal") - - -class PLt(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PLt, self).__init__(feature_left, feature_right, "less") - - -class PLe(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PLe, self).__init__(feature_left, feature_right, "less_equal") - - -class PEq(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PEq, self).__init__(feature_left, feature_right, "equal") - - -class PNe(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PNe, self).__init__(feature_left, feature_right, "not_equal") - - -class PAnd(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(PAnd, self).__init__(feature_left, feature_right, "bitwise_and") - - -class POr(PNpPairOperator): - def __init__(self, feature_left, feature_right): - super(POr, self).__init__(feature_left, feature_right, "bitwise_or") - - -#################### Triple-wise Operator #################### -class 
PIf(PExpressionOps): - def __init__(self, condition, feature_left, feature_right): - self.condition = condition - self.feature_left = feature_left - self.feature_right = feature_right - - def __str__(self): - return "PIf({},{},{})".format(self.condition, self.feature_left, self.feature_right) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series_cond = self.condition.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if isinstance(self.feature_left, PExpression): - series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - else: - series_left = self.feature_left - if isinstance(self.feature_right, PExpression): - series_right = self.feature_right.load_period_data( - instrument, start_offset, end_offset, cur_index, **kwargs - ) - else: - series_right = self.feature_right - series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index) - return series - - def get_period_offset(self, cur_index): - if isinstance(self.feature_left, PExpression): - left_br = self.feature_left.get_period_offset(cur_index) - else: - left_br = 0 - - if isinstance(self.feature_right, PExpression): - right_br = self.feature_right.get_period_offset(cur_index) - else: - right_br = 0 - - if isinstance(self.condition, PExpression): - c_br = self.condition.get_period_offset(cur_index) - else: - c_br = 0 - return max(left_br, right_br, c_br) - - -#################### PRolling #################### -# NOTE: methods like `rolling.mean` are optimized with cython, -# and are super faster than `rolling.apply(np.mean)` - - -class PRolling(PExpressionOps): - def __init__(self, feature, N, func): - self.feature = feature - self.N = N - self.func = func - - def __str__(self): - return "{}({},{})".format(type(self).__name__, self.feature, self.N) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # NOTE: remove all null check, - # now it's user's responsibility to decide whether use features in null days - # isnull = series.isnull() # NOTE: isnull = NaN, inf is not null - if self.N == 0: - series = getattr(series.expanding(min_periods=1), self.func)() - elif 0 < self.N < 1: - series = series.ewm(alpha=self.N, min_periods=1).mean() - else: - series = getattr(series.rolling(self.N, min_periods=1), self.func)() - # series.iloc[:self.N-1] = np.nan - # series[isnull] = np.nan - return series - - def get_period_offset(self, cur_index): - if self.N == 0: - return np.inf - if 0 < self.N < 1: - return int(np.log(1e-6) / np.log(1 - self.N)) # (1 - N)**window == 1e-6 - return self.feature.get_period_offset(cur_index) + self.N - 1 - - -class PRef(PRolling): - def __init__(self, feature, N): - super(PRef, self).__init__(feature, N, "ref") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # N = 0, return first day - if series.empty: - return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049 - elif self.N == 0: - series = pd.Series(series.iloc[0], index=series.index) - else: - series = series.shift(self.N) # copy - return series - - def get_period_offset(self, cur_index): - if self.N == 0: - return np.inf - return self.feature.get_period_offset(cur_index) + self.N - - -class PMean(PRolling): 
- def __init__(self, feature, N): - super(PMean, self).__init__(feature, N, "mean") - - -class PSum(PRolling): - def __init__(self, feature, N): - super(PSum, self).__init__(feature, N, "sum") - - -class PStd(PRolling): - def __init__(self, feature, N): - super(PStd, self).__init__(feature, N, "std") - - -class PVar(PRolling): - def __init__(self, feature, N): - super(PVar, self).__init__(feature, N, "var") - - -class PSkew(PRolling): - def __init__(self, feature, N): - if N != 0 and N < 3: - raise ValueError("The rolling window size of Skewness operation should >= 3") - super(PSkew, self).__init__(feature, N, "skew") - - -class PKurt(PRolling): - def __init__(self, feature, N): - if N != 0 and N < 4: - raise ValueError("The rolling window size of Kurtosis operation should >= 5") - super(PKurt, self).__init__(feature, N, "kurt") - - -class PMax(PRolling): - def __init__(self, feature, N): - super(PMax, self).__init__(feature, N, "max") - - -class PIdxMax(PRolling): - def __init__(self, feature, N): - super(PIdxMax, self).__init__(feature, N, "idxmax") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmax() + 1, raw=True) - return series - - -class PMin(PRolling): - def __init__(self, feature, N): - super(PMin, self).__init__(feature, N, "min") - - -class PIdxMin(PRolling): - def __init__(self, feature, N): - super(PIdxMin, self).__init__(feature, N, "idxmin") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(lambda x: x.argmin() + 1, raw=True) - return series - - -class PQuantile(PRolling): - def __init__(self, feature, N, qscore): - super(PQuantile, self).__init__(feature, N, "quantile") - self.qscore = qscore - - def __str__(self): - return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore) - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = series.expanding(min_periods=1).quantile(self.qscore) - else: - series = series.rolling(self.N, min_periods=1).quantile(self.qscore) - return series - - -class PMed(PRolling): - def __init__(self, feature, N): - super(PMed, self).__init__(feature, N, "median") - - -class PMad(PRolling): - def __init__(self, feature, N): - super(PMad, self).__init__(feature, N, "mad") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # TODO: implement in Cython - - def mad(x): - x1 = x[~np.isnan(x)] - return np.mean(np.abs(x1 - x1.mean())) - - if self.N == 0: - series = series.expanding(min_periods=1).apply(mad, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(mad, raw=True) - return series - - -class PRank(PRolling): - def __init__(self, feature, N): - super(PRank, self).__init__(feature, N, 
"rank") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # TODO: implement in Cython - - def rank(x): - if np.isnan(x[-1]): - return np.nan - x1 = x[~np.isnan(x)] - if x1.shape[0] == 0: - return np.nan - return percentileofscore(x1, x1[-1]) / len(x1) - - if self.N == 0: - series = series.expanding(min_periods=1).apply(rank, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(rank, raw=True) - return series - - -class PCount(PRolling): - def __init__(self, feature, N): - super(PCount, self).__init__(feature, N, "count") - - -class PDelta(PRolling): - def __init__(self, feature, N): - super(PDelta, self).__init__(feature, N, "delta") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = series - series.iloc[0] - else: - series = series - series.shift(self.N) - return series - - -# TODO: -# support pair-wise rolling like `PSlope(A, B, N)` -class PSlope(PRolling): - def __init__(self, feature, N): - super(PSlope, self).__init__(feature, N, "slope") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = pd.Series(expanding_slope(series.values), index=series.index) - else: - series = pd.Series(rolling_slope(series.values, self.N), index=series.index) - return series - - -class PRsquare(PRolling): - def __init__(self, feature, N): - super(PRsquare, self).__init__(feature, N, "rsquare") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - _series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = pd.Series(expanding_rsquare(_series.values), index=_series.index) - else: - series = pd.Series(rolling_rsquare(_series.values, self.N), index=_series.index) - series.loc[np.isclose(_series.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)] = np.nan - return series - - -class PResi(PRolling): - def __init__(self, feature, N): - super(PResi, self).__init__(feature, N, "resi") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - if self.N == 0: - series = pd.Series(expanding_resi(series.values), index=series.index) - else: - series = pd.Series(rolling_resi(series.values, self.N), index=series.index) - return series - - -class PWMA(PRolling): - def __init__(self, feature, N): - super(PWMA, self).__init__(feature, N, "wma") - - def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs): - series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs) - # TODO: implement in Cython - - def weighted_mean(x): - w = np.arange(len(x)) - w = w / w.sum() - return np.nanmean(w * x) - - if self.N == 0: - series = series.expanding(min_periods=1).apply(weighted_mean, raw=True) - else: - series = series.rolling(self.N, min_periods=1).apply(weighted_mean, raw=True) - return series - - -class PEMA(PRolling): - def __init__(self, feature, N): - super(PEMA, self).__init__(feature, N, "ema") - - def load_period_data(self, instrument, 
start_offset, end_offset, cur_index, **kwargs):
-        series = self.feature.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-
-        def exp_weighted_mean(x):
-            a = 1 - 2 / (1 + len(x))
-            w = a ** np.arange(len(x))[::-1]
-            w /= w.sum()
-            return np.nansum(w * x)
-
-        if self.N == 0:
-            series = series.expanding(min_periods=1).apply(exp_weighted_mean, raw=True)
-        elif 0 < self.N < 1:
-            series = series.ewm(alpha=self.N, min_periods=1).mean()
-        else:
-            series = series.ewm(span=self.N, min_periods=1).mean()
-        return series
-
-
-#################### Pair-Wise PRolling ####################
-class PairRolling(PExpressionOps):
-    def __init__(self, feature_left, feature_right, N, func):
-        self.feature_left = feature_left
-        self.feature_right = feature_right
-        self.N = N
-        self.func = func
-
-    def __str__(self):
-        return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N)
-
-    def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs):
-        series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-        series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-        if self.N == 0:
-            series = getattr(series_left.expanding(min_periods=1), self.func)(series_right)
-        else:
-            series = getattr(series_left.rolling(self.N, min_periods=1), self.func)(series_right)
-        return series
-
-    def get_period_offset(self, cur_index):
-        if self.N == 0:
-            return np.inf
-        return (
-            max(self.feature_left.get_period_offset(cur_index), self.feature_right.get_period_offset(cur_index))
-            + self.N
-            - 1
-        )
-
-
-class PCorr(PairRolling):
-    def __init__(self, feature_left, feature_right, N):
-        super(PCorr, self).__init__(feature_left, feature_right, N, "corr")
-
-    def load_period_data(self, instrument, start_offset, end_offset, cur_index, **kwargs):
-        res = super(PCorr, self).load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-
-        # NOTE: Load uses MemCache, so calling load_period_data again will not cause performance degradation
-        series_left = self.feature_left.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-        series_right = self.feature_right.load_period_data(instrument, start_offset, end_offset, cur_index, **kwargs)
-        res.loc[
-            np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
-            | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
-        ] = np.nan
-        return res
-
-
-class PCov(PairRolling):
-    def __init__(self, feature_left, feature_right, N):
-        super(PCov, self).__init__(feature_left, feature_right, N, "cov")
-
-
-PeriodOpsList = [
-    PRef,
-    PMax,
-    PMin,
-    PSum,
-    PMean,
-    PStd,
-    PVar,
-    PSkew,
-    PKurt,
-    PMed,
-    PMad,
-    PSlope,
-    PRsquare,
-    PResi,
-    PRank,
-    PQuantile,
-    PCount,
-    PEMA,
-    PWMA,
-    PCorr,
-    PCov,
-    PDelta,
-    PAbs,
-    PSign,
-    PLog,
-    PPower,
-    PAdd,
-    PSub,
-    PMul,
-    PDiv,
-    PGreater,
-    PLess,
-    PAnd,
-    POr,
-    PNot,
-    PGt,
-    PGe,
-    PLt,
-    PLe,
-    PEq,
-    PNe,
-    PMask,
-    PIdxMax,
-    PIdxMin,
-    PIf,
-]
diff --git a/qlib/data/pit.py b/qlib/data/pit.py
index 974137f8c4..2cf144c6ef 100644
--- a/qlib/data/pit.py
+++ b/qlib/data/pit.py
@@ -27,9 +27,6 @@ def _load_internal(self, instrument, start_index, end_index, freq):
 
         resample_data = np.empty(end_index - start_index + 1, dtype="float32")
         for cur_index in range(start_index, end_index + 1):
-            # YXDEBUG:
-            # if cur_index == end_index - 3:
-            #     __import__('ipdb').set_trace()
             cur_time = _calendar[cur_index]
             # To load expression accurately, more historical data are required
             start_ws, end_ws = self.feature.get_extended_window_size()
diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py
index 11ca1c7443..fa456670b0 100644
--- a/scripts/data_collector/pit/test_pit.py
+++ b/scripts/data_collector/pit/test_pit.py
@@ -13,7 +13,8 @@ class TestPIT(unittest.TestCase):
     """
 
     def setUp(self):
-        qlib.init(kernels=1)  # NOTE: set kernel to 1 to make it debug easier
+        # qlib.init(kernels=1)  # NOTE: set kernel to 1 to make it debug easier
+        qlib.init()  # NOTE: switch to the commented line above (kernels=1) to make debugging easier
 
     def to_str(self, obj):
         return "".join(str(obj).split())
@@ -178,6 +179,16 @@ def test_unlimit(self):
 
         self.check_same(s[~s.duplicated().values], expect)
 
+    def test_expr2(self):
+        instruments = ["sh600519"]
+        fields = ["P($$roewa_q)", "P($$yoyni_q)"]
+        fields += ["P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)"]
+        fields += ["P(Sum($$yoyni_q, 4))"]
+        fields += ["$close", "P($$roewa_q) * $close"]
+        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
+        print(data)
+        print(data.describe())
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/notest_PIT.py b/tests/notest_PIT.py
deleted file mode 100644
index cc1545e7f2..0000000000
--- a/tests/notest_PIT.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import sys
-import unittest
-import qlib
-from qlib.data import D
-from qlib.tests import TestAutoData
-
-
-class TestRegiterCustomOps(TestAutoData):
-    @classmethod
-    def setUpClass(cls) -> None:
-        # use default data
-        qlib.init()
-
-    def test_regiter_custom_ops(self):
-
-        instruments = ["sh600519"]
-        fields = ["$$roewa_q", "$$yoyni_q"]
-        fields += ["($$roewa_q / $$yoyni_q) / PRef($$roewa_q / $$yoyni_q, 1) - 1"]
-        fields += ["PSum($$yoyni_q, 4)"]
-        fields += ["$close", "$$roewa_q*$close"]
-        data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
-        print(data)
-        print(data.describe())
-
-
-if __name__ == "__main__":
-    unittest.main()

From de8d6cb123cd939f9c9258cc4a51b5abfb307375 Mon Sep 17 00:00:00 2001
From: Young
Date: Thu, 10 Mar 2022 12:33:25 +0800
Subject: [PATCH 29/30] fix PYlint

---
 README.md          | 6 +++---
 docs/index.rst     | 1 +
 qlib/data/base.py  | 6 ------
 qlib/data/cache.py | 2 +-
 qlib/data/data.py  | 2 --
 qlib/data/ops.py   | 2 +-
 qlib/data/pit.py   | 2 +-
 7 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 28ff004b39..3e2faa94da 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
 Recent released features
 | Feature | Status |
 | -- | ------ |
+| Point-in-Time database | :hammer: [Released](https://github.com/microsoft/qlib/pull/343) on Mar 10, 2022 |
 | Arctic Provider Backend & Orderbook data example | :hammer: [Rleased](https://github.com/microsoft/qlib/pull/744) on Jan 17, 2022 |
 | Meta-Learning-based framework & DDG-DA | :chart_with_upwards_trend: :hammer: [Released](https://github.com/microsoft/qlib/pull/743) on Jan 10, 2022 |
 | Planning-based portfolio optimization | :hammer: [Released](https://github.com/microsoft/qlib/pull/754) on Dec 28, 2021 |
@@ -95,9 +96,8 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
 
 # Plans
 New features under development(order by estimated release time).
Your feedbacks about the features are very important. -| Feature | Status | -| -- | ------ | -| Point-in-Time database | Under review: https://github.com/microsoft/qlib/pull/343 | + + # Framework of Qlib diff --git a/docs/index.rst b/docs/index.rst index 9409d2ac19..f55262c027 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -53,6 +53,7 @@ Document Structure Online & Offline mode Serialization Task Management + Point-In-Time database .. toctree:: :maxdepth: 3 diff --git a/qlib/data/base.py b/qlib/data/base.py index e93938f610..427c15e3ca 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -5,12 +5,8 @@ from __future__ import division from __future__ import print_function -import os import abc import pandas as pd -import numpy as np - -from ..utils import code_to_fname from ..log import get_module_logger @@ -270,5 +266,3 @@ class ExpressionOps(Expression): This kind of feature will use operator for feature construction on the fly. """ - - pass diff --git a/qlib/data/cache.py b/qlib/data/cache.py index 7d08e728df..fc6518de51 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -32,7 +32,7 @@ ) from ..log import get_module_logger -from .base import Feature, PFeature +from .base import Feature from .ops import Operators # pylint: disable=W0611 diff --git a/qlib/data/data.py b/qlib/data/data.py index aef27dbb25..cd8f7f77f6 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -6,7 +6,6 @@ from __future__ import print_function import re -import os import abc import copy import queue @@ -24,7 +23,6 @@ from ..log import get_module_logger from .cache import DiskDatasetCache -from .base import Feature, PFeature from ..utils import ( Wrapper, init_instance_by_config, diff --git a/qlib/data/ops.py b/qlib/data/ops.py index fd7eb94bb6..bdc032c037 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1643,7 +1643,7 @@ def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") - from qlib.data.pit import P + from qlib.data.pit import P # pylint: disable=C0415 Operators.reset() Operators.register(OpsList + [P]) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 2cf144c6ef..ebe01eaf26 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -17,11 +17,11 @@ import pandas as pd from qlib.data.ops import ElemOperator from qlib.log import get_module_logger +from .data import Cal class P(ElemOperator): def _load_internal(self, instrument, start_index, end_index, freq): - from .data import Cal _calendar = Cal.calendar(freq=freq) resample_data = np.empty(end_index - start_index + 1, dtype="float32") From 2671dc2a162e3501f7e874c793eeb3f99e86dbad Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 10 Mar 2022 14:27:23 +0800 Subject: [PATCH 30/30] Rename --- docs/advanced/PIT.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/PIT.rst b/docs/advanced/PIT.rst index 728c98b442..f828a43e45 100644 --- a/docs/advanced/PIT.rst +++ b/docs/advanced/PIT.rst @@ -1,4 +1,4 @@ -.. _alpha: +.. _pit: =========================== (P)oint-(I)n-(T)ime Database
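
Usage note: below is a minimal sketch of the point-in-time expressions exercised by the `test_expr2` case added in this series. It assumes qlib is installed with a prepared data bundle that already contains the PIT fields `$$roewa_q` and `$$yoyni_q` (the exact field set depends on the collected PIT data); `P`, `Ref`, and `Sum` are the operator names registered by these patches.

    import qlib
    from qlib.data import D

    # Default initialization; pass kernels=1 (as TestPIT.setUp suggests) for easier step-through debugging.
    qlib.init()

    instruments = ["sh600519"]
    fields = [
        "P($$roewa_q)",          # latest value observable on each trading day
        "P($$yoyni_q)",
        "P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1)",  # change vs. the previously announced period
        "P(Sum($$yoyni_q, 4))",  # sum over the last four announced periods
        "$close",
        "P($$roewa_q) * $close", # PIT factors can be combined with ordinary daily features
    ]

    # D.features evaluates each expression over the daily calendar; the P(...) wrapper
    # maps the period-indexed (announcement-based) values back onto trading days.
    df = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
    print(df.describe())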