Commit 89e819a5 authored by wenwen.tang's avatar wenwen.tang 😕

推荐系统

parent c418191a
import json
from typing import List
import numpy as np
from py_jftech import autowired, parse_date, prev_workday, format_date
from ai.config import LABEL_RANGE, LABEL_TAG
from ai.dao import robo_predict
from ai.dao.robo_datas import get_base_info, get_index_list, get_fund_list
from ai.data_access import DataAccess
from ai.model_trainer import ModelTrainer
from ai.noticer import send
from ai.training_data_builder import TrainingDataBuilder
from api import DataSync
# Data cutoff date (None = use everything up to today); set a date string to back-test
max_date = None
# max_date = '2024-03-20'
# max_date = '2024-01-11'
toForecast = True # False means test, True means forecast
syncData = True # when True, sync index and fund data into the local database first
uploadData = True # when True, upload prediction results (DB or JRP service)
doReport = True # when True, generate the Excel report at the end
# ids of the indexes/funds to predict
# PREDICT_LIST = [67, 121, 122, 123]
PREDICT_LIST = [67, 121, 122, 123, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
                164, 165, 166, 167, 168, 169, 170, 171, 174, 175, 177, 178]
# ids of economic indicators used as features
eco = [65, 66, 74, 134]
# ids of market indexes used as features (name shadows builtin 'index'; kept for compatibility)
index = [67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 105, 106, 116, 117, 138, 139, 142, 143, 140, 141, 144, 145, 146]
# ids of funds that can be predicted
fund = [121, 122, 123, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
        166, 167, 168, 169, 170, 171, 174, 175, 177, 178]
@autowired
def sync(syncs: List[DataSync] = None):
    """Run every injected DataSync implementation to refresh the local database."""
    for synchronizer in syncs:
        synchronizer.do_sync()
def report_prediction(label, predict_item, indexDict: dict):
    """Print, optionally upload, and mail the prediction for one instrument.

    Reads module globals ``forecastDay``, ``numForecastDays``, ``uploadData``.
    Returns the (unchanged) label.
    """
    prediction = label
    predictionStr = LABEL_TAG.get(prediction)
    content = f"""\n On day {forecastDay.strftime("%m/%d/%Y")}, the model predicts {predict_item} to be {predictionStr} in {str(numForecastDays)} business days. \n"""
    print(content)
    # reverse-lookup the instrument id from its short ticker name
    matching_ids = [rbd_id for rbd_id, ticker in indexDict.items() if ticker == predict_item]
    key = matching_ids[0]
    index_info = get_base_info(key)[0]
    if uploadData:
        if len(LABEL_RANGE) > 2:
            # multi-class labels are stored in our own robo_predict table
            robo_predict.insert({"rbd_id": key, "date": forecastDay, "predict": prediction})
        else:
            # binary labels go to the external JRP service, plus a notification mail
            from ai.noticer import upload_predict
            upload_predict(index_info['ticker'], forecastDay, predictionStr)
            send(content)
    return prediction
def judgement(id, type, predict):
    """Back-check a prediction against realized prices and append the outcome to predict.txt.

    Loads the next ``predict_term`` closes/NAVs starting at the prediction date and,
    only when the full horizon is available, records for each model whether its
    predicted direction matched the realized direction.

    :param id: index or fund id (parameter names shadow builtins; kept for callers)
    :param type: 'INDEX' or 'FUND'
    :param predict: mapping of model name -> 1-element prediction array
    """
    from datetime import datetime
    predict_term = 21  # forecast horizon in business days
    start = parse_date(max_date) if max_date else prev_workday(datetime.today())
    navs = []
    if type == 'INDEX':
        navs = [nav['rid_close'] for nav in get_index_list(index_ids=id, min_date=start, limit=predict_term)]
    elif type == 'FUND':
        navs = [nav['rfn_nav_cal'] for nav in get_fund_list(fund_ids=id, min_date=start, limit=predict_term)]
    # only judge once the full horizon of realized prices exists
    if len(navs) == predict_term:
        upper = navs[-1] >= navs[0]  # realized direction: did the price end at or above the start?
        # a model is "correct" when its predicted direction matches the realized one
        result = {model: (pred[0] > 0) == upper for model, pred in predict.items()}
        record = {
            'id': id,
            'date': format_date(start),
            'result': result
        }
        with open('predict.txt', 'a+') as file:
            file.write(json.dumps(record))
            file.write('\n')
########################################
if __name__ == '__main__':
    # Entry point: sync data, build features, train models, predict and report.
    if syncData:
        sync()
    # define some parameters
    win1W = 5 # 1 week
    win1M = 21 # 1 Month
    win1Q = 63 # 1 Quarter
    numForecastDays = 21 # business days, 21 business days means one month
    theThreshold = 0.0
    # every id involved: prediction targets plus all feature sources
    ids = set(PREDICT_LIST) | set(eco) | set(index) | set(fund)
    infos = get_base_info(ids)
    infos_type = {info['id']: info['type'] for info in infos}
    # id -> short ticker name (e.g. 'SPX'); used as row/column keys everywhere below
    indexDict = {info['id']: info['ticker'].replace(' Index', '').replace(' Equity', '').replace(' ', '_') for info in
                 infos}
    ###################
    # Step 1: Prepare X and y (features and labels)
    # load the raw data sets
    data_access = DataAccess(index, eco, fund, max_date, indexDict)
    indexData = data_access.get_index_datas()
    ecoData = data_access.get_eco_datas()
    fundData = data_access.get_fund_datas()
    # index-derived feature frames
    vixData = data_access.get_vix(indexData)
    indexOtherData = data_access.get_other_index(indexData)
    # economic-indicator feature frames
    cpiData = data_access.get_cpi(ecoData)
    FDTRData = data_access.get_fdtr(ecoData)
    # NAPMPMI: US ISM manufacturing index (monthly)
    NAPMPMIData = data_access.get_napmpmi(ecoData)
    builder = TrainingDataBuilder(index, eco, fund, indexDict, toForecast, win1W, win1M, win1Q, numForecastDays,
                                  theThreshold)
    for pid in PREDICT_LIST:
        print(f'{indexDict[pid]} start '.center(50, '='))
        # indexes carry OHLCV data, funds only NAV
        t_data = indexData if pid in index else fundData
        # forecastDay is read as a module global by report_prediction
        X_train, X_test, y_train, y_test, scaledX_forecast, forecastDay = \
            builder.build_train_test(pid, t_data, vixData, indexOtherData, cpiData, FDTRData, NAPMPMIData)
        trainer = ModelTrainer(toForecast)
        rf_model = trainer.train_random_forest(X_train, y_train, X_test, y_test)
        gbt_model = trainer.train_GBT(X_train, y_train, X_test, y_test)
        svc_model = trainer.train_SVC(X_train, y_train, X_test, y_test)
        knn_model = trainer.train_nearest_neighbors(X_train, y_train, X_test, y_test)
        ada_model = trainer.train_AdaBoost(X_train, y_train, X_test, y_test)
        ensemble_model = trainer.ensemble_model(rf_model, gbt_model, svc_model,
                                                knn_model, ada_model, X_train, y_train, X_test, y_test)
        # one single-element prediction array per model
        model_predict = {'forest': rf_model.predict(scaledX_forecast),
                         'gbt': gbt_model.predict(scaledX_forecast),
                         'svc': svc_model.predict(scaledX_forecast),
                         'knn': knn_model.predict(scaledX_forecast),
                         'adaboost': ada_model.predict(scaledX_forecast),
                         'ensemble': ensemble_model.predict(scaledX_forecast)}
        print(f'预测结果:{model_predict}'.center(60, '+'))
        # back-check an earlier prediction against now-realized prices
        judgement(pid, infos_type[pid], model_predict)
        if toForecast:
            if len(LABEL_RANGE) > 2:
                # multi-class: report the rounded mean label across all models
                average = round(np.mean(list(model_predict.values())))
                report_prediction(average, indexDict[pid], indexDict)
            else:
                # binary: report the ensemble's vote
                # NOTE(review): this passes a 1-element ndarray rather than a scalar
                # label into report_prediction (LABEL_TAG.get on an ndarray) - verify
                # whether a trailing [0] is missing here
                report_prediction(ensemble_model.predict(scaledX_forecast), indexDict[pid], indexDict)
    # report generation runs once, after all targets are processed
    if doReport:
        if len(LABEL_RANGE) > 2:
            from ai.reporter import do_reporter2
            do_reporter2()
        else:
            from ai.reporter import do_reporter
            do_reporter()
# Prediction label buckets: label -> [lower, upper) bounds on the forecast log-return
from math import inf
LABEL_RANGE = {2: [0.05, inf], 1: [0.02, 0.05], 0: [-0.02, 0.02], -1: [-0.05, -0.02], -2: [-inf, -0.05]}
# Human-readable tag for each label
LABEL_TAG = {2: 'UPUP', 1: 'UP', 0: 'NEUTRAL', -1: 'DOWN', -2: 'DOWNDOWN'}
from py_jftech import read, to_tuple, where
@read
def get_index_list(index_ids=None, min_date=None, max_date=None, limit=None):
    """Build the SQL selecting index quote rows, optionally bounded by date and row count."""
    conditions = []
    if min_date:
        conditions.append(f"rid_date >= '{min_date}'")
    if max_date:
        conditions.append(f"rid_date <= '{max_date}'")
    tail = f'limit {limit}' if limit else ''
    return f'''
        select * from robo_index_datas
        {where(*conditions, rid_index_id=to_tuple(index_ids))} order by rid_index_id, rid_date {tail}
    '''
@read
def get_eco_list(eco_ids=None, min_date=None, max_date=None):
    """Build the SQL selecting economic-indicator rows, optionally bounded by date."""
    conditions = []
    if min_date:
        conditions.append(f"red_date >= '{min_date}'")
    if max_date:
        conditions.append(f"red_date <= '{max_date}'")
    return f'''
        select * from robo_eco_datas
        {where(*conditions, red_eco_id=to_tuple(eco_ids))} order by red_eco_id, red_date
    '''
@read
def get_fund_list(fund_ids=None, min_date=None, max_date=None, limit=None):
    """Build the SQL selecting fund NAV rows, optionally bounded by date and row count."""
    conditions = []
    if min_date:
        conditions.append(f"rfn_date >= '{min_date}'")
    if max_date:
        conditions.append(f"rfn_date <= '{max_date}'")
    tail = f'limit {limit}' if limit else ''
    return f'''
        select * from robo_fund_navs
        {where(*conditions, rfn_fund_id=to_tuple(fund_ids))} order by rfn_fund_id, rfn_date {tail}
    '''
@read
def get_base_info(ids=None):
    """Build the SQL selecting id/ticker/type/datas for the given instrument ids."""
    # no extra conditions beyond the optional id filter
    return f"""
        SELECT rbd_id id,v_rbd_bloomberg_ticker ticker,v_rbd_type type, rbd_datas datas FROM `robo_base_datum`
        {where(rbd_id=to_tuple(ids))}
    """
from py_jftech import read, write, format_date, to_tuple, where, mapper_columns
# Physical column name in robo_predict -> logical field name exposed to callers
__COLUMNS__ = {
    'rp_rbd_id': 'rbd_id',
    'rp_date': 'date',
    'rp_predict': 'predict',
    'rp_remark': 'remark',
    'rp_create_time': 'create_time'
}
@write
def insert(datas):
    """Build the SQL inserting (or overwriting) one prediction row in robo_predict.

    Uses REPLACE INTO, so an existing row with the same unique key is overwritten.
    NOTE(review): values are interpolated into the SQL as quoted strings rather than
    bound parameters; callers must pass trusted data only.
    """
    datas = mapper_columns(datas=datas, columns=__COLUMNS__)
    columns = ','.join(datas)
    values = ','.join(f"'{v}'" for v in datas.values())
    return f'''
        replace into robo_predict({columns})
        values ({values})
    '''
@read
def get_list(index_ids: object = None, min_date: object = None, max_date: object = None) -> object:
    """Build the SQL selecting predictions, optionally filtered by instrument id and date range.

    Bug fix: the id filter previously used ``rid_index_id``, a column that does not
    exist on robo_predict; the correct column is ``rp_rbd_id`` (see the table DDL and
    the ``__COLUMNS__`` mapping above).
    """
    sqls = []
    if min_date:
        sqls.append(f"rp_date >= '{format_date(min_date)}'")
    if max_date:
        sqls.append(f"rp_date <= '{format_date(max_date)}'")
    return f'''
        select {','.join([f"{x[0]} as {x[1]}" for x in __COLUMNS__.items()])} from robo_predict
        {where(*sqls, rp_rbd_id=to_tuple(index_ids))} order by rp_rbd_id, rp_date
    '''
-- Prediction results: one row per instrument per forecast date.
CREATE TABLE IF NOT EXISTS robo_predict (
  `rp_rbd_id` bigint(20) NOT NULL, -- instrument id (robo_base_datum.rbd_id)
  `rp_date` datetime NOT NULL, -- forecast date
  `rp_predict` int(11) NOT NULL, -- predicted label (see LABEL_RANGE/LABEL_TAG)
  `rp_remark` json NULL, -- optional free-form metadata
  `rp_create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `rp_update_time` datetime NULL DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  INDEX `rp_rbd_id`(`rp_rbd_id`, `rp_date`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
\ No newline at end of file
from abc import ABC
import pandas as pd
from ai.dao.robo_datas import get_eco_list, get_fund_list, get_index_list
class DataAccess(ABC):
    """Loads raw index/eco/fund rows via the DAO layer and shapes them into feature DataFrames.

    All frames use a 'date' column (or DatetimeIndex) and instrument ids mapped to
    their short ticker names through ``indexDict``.
    """

    def __init__(self, index, eco, fund, max_date, indexDict) -> None:
        super().__init__()
        self._index = index  # ids of market indexes
        self._eco = eco  # ids of economic indicators
        self._fund = fund  # ids of funds
        self._max_date = max_date  # inclusive data cutoff (None = no cutoff)
        self._indexDict = indexDict  # id -> short ticker name

    def get_index_datas(self):
        """Return index quote rows with ids mapped to names and series 76/77 spliced into 105/106."""
        indexData = pd.DataFrame(
            get_index_list(index_ids=self._index, max_date=self._max_date))
        # todo erp has no data: "rid_erp",
        indexData = indexData[
            ["rid_index_id", "rid_date", "rid_high", "rid_open", "rid_low", "rid_close", "rid_pe", "rid_pb",
             "rid_volume", "rid_frdpe", "rid_frdpes", "rid_pc"]]
        # Series splice: use 105 and 106 from 2022-01-03 onward (both start there);
        # before 2022, 105's history is taken from 76 and 106's from 77.
        condition1 = ((indexData['rid_index_id'] == 76) & (indexData['rid_date'] >= '2022-01-03'))
        condition2 = ((indexData['rid_index_id'] == 77) & (indexData['rid_date'] >= '2022-01-03'))
        condition3 = ((indexData['rid_index_id'] == 105) & (indexData['rid_date'] < '2022-01-03'))
        condition4 = ((indexData['rid_index_id'] == 106) & (indexData['rid_date'] < '2022-01-03'))
        indexData.drop(indexData[condition1 | condition2 | condition3 | condition4].index, inplace=True)
        indexData.loc[indexData['rid_index_id'] == 76, 'rid_index_id'] = 105
        indexData.loc[indexData['rid_index_id'] == 77, 'rid_index_id'] = 106
        indexData.rename(columns={"rid_date": 'date'}, inplace=True)  # please use 'date'
        indexData["rid_index_id"] = indexData["rid_index_id"].map(self._indexDict)
        return indexData

    def get_eco_datas(self):
        """Return economic-indicator rows (id, date, indicator) with ids mapped to names."""
        ecoData = pd.DataFrame(
            get_eco_list(eco_ids=self._eco, max_date=self._max_date))
        ecoData = ecoData[["red_eco_id", "red_date", "red_indicator"]]
        ecoData.rename(columns={"red_date": 'date'}, inplace=True)  # please use 'date'
        ecoData["red_eco_id"] = ecoData["red_eco_id"].map(self._indexDict)
        return ecoData

    def get_fund_datas(self):
        """Return fund NAV rows (id, date, nav) with ids mapped to names."""
        fundData = pd.DataFrame(
            get_fund_list(fund_ids=self._fund, max_date=self._max_date))
        fundData = fundData[["rfn_fund_id", "rfn_date", "rfn_nav_cal"]]
        fundData.rename(columns={"rfn_date": 'date'}, inplace=True)  # please use 'date'
        fundData["rfn_fund_id"] = fundData["rfn_fund_id"].map(self._indexDict)
        return fundData

    def get_vix(self, indexData):
        """Extract the VIX series as a date-indexed frame with 'vix_*' columns."""
        # VIX: CBOE SPX volatility index
        vixData = indexData[indexData['rid_index_id'] == "VIX"].copy()
        vixData = vixData[
            ["date", "rid_high", "rid_open", "rid_low", "rid_close", "rid_pc", "rid_pb", "rid_pe", "rid_frdpe",
             "rid_frdpes"]]
        vixData.rename(
            columns={"rid_high": 'vix_high', 'rid_open': 'vix_open', "rid_low": 'vix_low', "rid_volume": 'vix_volume',
                     "rid_close": 'vix_close', "rid_pc": 'vix_pc', "rid_pb": 'vix_pb', "rid_pe": 'vix_pe',
                     "rid_frdpe": 'vix_frdpe', "rid_frdpes": 'vix_frdpes'},
            inplace=True)
        vixData.set_index('date', inplace=True)
        vixData.index = pd.to_datetime(vixData.index)
        # drop columns that are entirely/partially NaN so downstream merges stay clean
        vixData.dropna(axis=1, inplace=True)
        return vixData

    def get_other_index(self, indexData):
        """Merge the secondary feature indexes into one date-indexed frame with prefixed columns."""
        other_index = ["USGG10YR", "USGG2YR", "CCMP", "TSFR1M", "TSFR12M", "COI_TOTL", "LEI_TOTL", "MID",
                       "OE4EKLAC", "OEA5KLAC", "OECNKLAC", "OEJPKLAC", "OEOTGTAC", "OEUSKLAC", "USRINDEX", "SPX"]
        cols = ['date', 'rid_close', 'rid_pe', 'rid_pb', 'rid_volume', 'rid_frdpe', 'rid_frdpes', 'rid_pc']
        indexOtherData = pd.DataFrame()
        # only include indexes that were actually requested/loaded
        idxs = [self._indexDict[i] for i in self._index]
        for idx in other_index:
            if idx in idxs:
                idx_data = indexData[indexData['rid_index_id'] == idx].copy()
                idx_data = idx_data[cols]
                idx_data.rename(
                    columns={"rid_close": f'{idx}_close', 'rid_pe': f'{idx}_pe', 'rid_pb': f'{idx}_pb',
                             'rid_volume': f'{idx}_volume', 'rid_frdpe': f'{idx}_frdpe', 'rid_frdpes': f'{idx}_frdpes',
                             'rid_pc': f'{idx}_pc'},
                    inplace=True)
                idx_data.set_index('date', inplace=True)
                idx_data.index = pd.to_datetime(idx_data.index)
                if indexOtherData.size > 0:
                    indexOtherData = pd.merge(indexOtherData, idx_data, how='outer', on='date')
                else:
                    indexOtherData = idx_data
        # fill gaps forward then backward, then drop columns that are still NaN
        indexOtherData.ffill(inplace=True)
        indexOtherData.bfill(inplace=True)
        indexOtherData = indexOtherData.dropna(axis=1)
        return indexOtherData

    def get_cpi(self, ecoData):
        """Pivot the CPI series and derive annualized MoM change and MoM difference."""
        # CPI_YOY: US urban CPI year-over-year, NSA; CPURNSA: US CPI, NSA
        cpiData = ecoData[(ecoData['red_eco_id'] == "CPI_YOY") | (ecoData['red_eco_id'] == "CPURNSA")].copy()
        cpiData = cpiData.pivot(index='date', columns='red_eco_id', values='red_indicator')
        cpiData['CPI_MOM'] = (cpiData['CPURNSA'] / cpiData['CPURNSA'].shift(
            1) - 1.0) * 100 * 12  # Annualized Percentage
        cpiData['CPI_MOM_Diff'] = cpiData['CPURNSA'] - cpiData['CPURNSA'].shift(1)
        cpiData.index = pd.to_datetime(cpiData.index)
        return cpiData

    def get_fdtr(self, ecoData):
        """Extract the FDTR series (US federal funds target rate) as a date-indexed frame."""
        FDTRData = ecoData[ecoData['red_eco_id'] == "FDTR"].copy()
        del (FDTRData['red_eco_id'])
        FDTRData.rename(columns={"red_indicator": 'FDTR'}, inplace=True)
        FDTRData.set_index('date', inplace=True)
        FDTRData.index = pd.to_datetime(FDTRData.index)
        return FDTRData

    def get_napmpmi(self, ecoData):
        """Extract the NAPMPMI series (US ISM manufacturing index, monthly) as a date-indexed frame."""
        NAPMPMIData = ecoData[ecoData['red_eco_id'] == "NAPMPMI"].copy()
        del (NAPMPMIData['red_eco_id'])
        NAPMPMIData.rename(columns={"red_indicator": 'NAPMPMI'}, inplace=True)
        NAPMPMIData.set_index('date', inplace=True)
        NAPMPMIData.index = pd.to_datetime(NAPMPMIData.index)
        return NAPMPMIData
from abc import ABC
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from ai.config import LABEL_RANGE
class ModelTrainer(ABC):
    """
    Trains the individual classifiers and a hard-voting ensemble.

    When ``toForecast`` is False the trainer is in back-test mode and every
    ``train_*`` call also prints a confusion matrix, classification report and
    accuracy for the held-out test set.

    Refactor: the five ``train_*`` methods previously duplicated the identical
    fit-then-optionally-evaluate pattern; that pattern now lives in ``_fit``.
    Public interface is unchanged.
    """

    def __init__(self, toForecast) -> None:
        super().__init__()
        self._toForecast = toForecast

    ###################
    # Step 3: Train the model
    def test_model(self, strMethod, classifier, X_test, y_test):
        """Print confusion matrix, classification report and accuracy, and plot the matrix."""
        print(strMethod + " ====== test results ======")
        y_pred = classifier.predict(X_test)
        labels = list(LABEL_RANGE.keys())[::-1]  # ascending label order for the matrix axes
        result0 = confusion_matrix(y_test, y_pred, labels=labels)
        print(strMethod + " Confusion Matrix:")
        print(result0)
        result1 = classification_report(y_test, y_pred, zero_division=1.0)
        print(strMethod + " Classification Report:")
        print(result1)
        result2 = accuracy_score(y_test, y_pred)
        print(strMethod + " Accuracy:", result2)
        cm_display = ConfusionMatrixDisplay(confusion_matrix=result0, display_labels=labels)
        cm_display.plot()
        plt.title(strMethod + ' Accuracy: ' + f'{result2:.0%}')
        plt.show()

    def _fit(self, name, classifier, X_train, y_train, X_test, y_test):
        """Fit *classifier* on the training set; in back-test mode also evaluate it."""
        classifier.fit(X_train, y_train)
        if not self._toForecast:
            self.test_model(name, classifier, X_test, y_test)
        return classifier

    def train_random_forest(self, X_train, y_train, X_test, y_test):
        return self._fit('Random Forest', RandomForestClassifier(), X_train, y_train, X_test, y_test)

    def train_GBT(self, X_train, y_train, X_test, y_test):
        # Gradient Boosted Tree (LightGBM)
        return self._fit('Gradient Boosted Tree', LGBMClassifier(), X_train, y_train, X_test, y_test)

    def train_SVC(self, X_train, y_train, X_test, y_test):
        # Support Vector Machines
        return self._fit('Support Vector Machines', svm.SVC(), X_train, y_train, X_test, y_test)

    def train_nearest_neighbors(self, X_train, y_train, X_test, y_test):
        return self._fit('K-Nearest Neighbors', KNeighborsClassifier(), X_train, y_train, X_test, y_test)

    def train_AdaBoost(self, X_train, y_train, X_test, y_test):
        return self._fit('AdaBoost', AdaBoostClassifier(), X_train, y_train, X_test, y_test)

    def ensemble_model(self, rf_model, gbt_model, svc_model, knn_model,
                       ada_model, X_train, y_train, X_test, y_test):
        """Build a hard-voting ensemble from the already-fitted base models and fit it."""
        # Create a dictionary of our models
        estimators = [('rf', rf_model), ('gbt', gbt_model), ('svc', svc_model),
                      ('knn', knn_model), ('AdaBoost', ada_model)]
        # Create our voting classifier, inputting our models
        ensemble = VotingClassifier(estimators, voting='hard')
        return self._fit('Ensemble Model', ensemble, X_train, y_train, X_test, y_test)
from datetime import datetime
import requests
from py_jftech import sendmail, format_date
# recipients of the prediction notification mail
email = ['wenwen.tang@thizgroup.com']
# base URL of the JRP service predictions are uploaded to
jrp_domain = 'https://jrp.jfquant.com/api/v1.0'
# jrp_domain = 'http://localhost:7090/jrp'
def send(content):
    """Mail *content* to the configured recipients with a dated subject line."""
    subject = '预测_{today}'.format(today=format_date(datetime.today()))
    sendmail(receives=email, copies=[], attach_paths=[], subject=subject, content=content)
def upload_predict(ticker, predictDate, predict):
    """Push one binary prediction to the JRP service.

    :param ticker: bloomberg ticker of the instrument
    :param predictDate: forecast date
    :param predict: 'UP' maps to 1, anything else to -1
    """
    predict_data = {
        "aiPredict": {
            "predictDate": format_date(predictDate),
            "predict": 1 if predict == 'UP' else -1
        },
        "bloombergTicker": ticker
    }
    # NOTE(review): hard-coded API token - consider moving to configuration
    headers = {"X-AUTH-token": "rt7297LwQvyAYTke2iD8Vg"}
    # timeout added so a hung JRP endpoint cannot block the whole prediction run
    response = requests.post(url=f'{jrp_domain}/ai/predict', json=predict_data, headers=headers, timeout=30)
    if response.status_code != 200:
        print(response.text)
        print("上传ai预测结果失败,请重试")
{"id": 67, "date": "2023-10-06", "result": {"forest": false, "gbt": true, "svc": true, "ensemble": true}}
{"id": 121, "date": "2023-10-06", "result": {"forest": true, "gbt": true, "svc": false, "ensemble": true}}
{"id": 122, "date": "2023-10-06", "result": {"forest": true, "gbt": true, "svc": false, "ensemble": true}}
{"id": 123, "date": "2023-10-06", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 67, "date": "2023-10-13", "result": {"forest": false, "gbt": true, "svc": true, "ensemble": true}}
{"id": 121, "date": "2023-10-13", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-10-13", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-10-13", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 67, "date": "2023-10-20", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-10-20", "result": {"forest": false, "gbt": true, "svc": false, "ensemble": true}}
{"id": 122, "date": "2023-10-20", "result": {"forest": true, "gbt": true, "svc": false, "ensemble": true}}
{"id": 123, "date": "2023-10-20", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 67, "date": "2023-10-27", "result": {"forest": false, "gbt": true, "svc": true, "ensemble": true}}
{"id": 121, "date": "2023-10-27", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-10-27", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-10-27", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 67, "date": "2023-11-03", "result": {"forest": true, "gbt": true, "svc": true, "ensemble": true}}
{"id": 121, "date": "2023-11-03", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-11-03", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-11-03", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 67, "date": "2023-11-17", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-11-17", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-11-17", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-11-17", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 67, "date": "2023-11-24", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-11-24", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-11-24", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-11-24", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 67, "date": "2023-12-01", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-12-01", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-12-01", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-12-01", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 67, "date": "2023-12-08", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-12-08", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 122, "date": "2023-12-08", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 123, "date": "2023-12-08", "result": {"forest": false, "gbt": false, "svc": false, "ensemble": false}}
{"id": 67, "date": "2023-12-15", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 121, "date": "2023-12-15", "result": {"forest": true, "gbt": true, "svc": true, "ensemble": true}}
{"id": 122, "date": "2023-12-15", "result": {"forest": false, "gbt": false, "svc": true, "ensemble": false}}
{"id": 123, "date": "2023-12-15", "result": {"forest": true, "gbt": true, "svc": true, "ensemble": true}}
import datetime
import json
import pandas as pd
import requests
from ai.config import LABEL_TAG
from ai.dao import robo_predict
from ai.dao.robo_datas import get_base_info
# Canonical ticker order for rows in the generated Excel report
symbols = ['ACWI', 'EWJ', 'MCHI', 'EEM', 'BKF', 'INDA', 'AAXJ', 'VGK', 'QQQ', 'SPY', 'SPX', 'IWN',
           'IUSG', 'IWD', 'DON', 'GDX', 'TOLZ', 'XLU', 'XBI', 'ESGD', 'IGE', 'EMLC', 'IGAA',
           'LQD', 'HYG', 'SHY', 'IEI', 'IEF', 'GLD', 'IYR', 'UUP', 'CEW', 'TLT']
def do_reporter(start='2023-10-01', end=None):
    """Fetch binary predictions from JRP between *start* and *end* and write Forcast_Report.xlsx.

    Bug fix: ``end`` previously defaulted to ``datetime.date.today()``, which is
    evaluated once at import time - a long-lived process would keep using a stale
    end date. It now defaults to None and resolves to today's date at call time.

    :param start: inclusive start date string 'YYYY-MM-DD'
    :param end: inclusive end date (datetime.date); defaults to today
    """
    if end is None:
        end = datetime.date.today()
    url = f"https://jrp.jfquant.com/api/v1.0/ai/predict?startTime={start}&endTime={end.strftime('%Y-%m-%d')}"
    resp = requests.get(url)
    datas = []
    # symbol -> position in the canonical report order
    symbol_index_dict = {symbol: index for index, symbol in enumerate(symbols)}
    for value in resp.json()['body'].values():
        for item in value:
            data = {
                'Forcast On Date': item['aiPredict']['predictDate'],
                'Ticker': item['bloombergTicker'].replace(' Index', '').replace(' Equity', ''),
                'In 21 business days': 'UP' if item['aiPredict']['predict'] == 1 else 'DOWN',
                'Ticker Name': item['indexName'],
            }
            datas.append(data)
    sorted_data = sorted(datas, key=lambda x: symbol_index_dict[x['Ticker'].split(' ')[0]])
    print(json.dumps(sorted_data, ensure_ascii=False))
    pf = pd.DataFrame(sorted_data)
    pf.to_excel("Forcast_Report.xlsx", index=False)
def do_reporter2():
    """Build Forcast_Report.xlsx from locally stored multi-class predictions."""
    info = {x['id']: x for x in get_base_info()}
    # symbol -> position in the canonical report order
    symbol_index_dict = {symbol: index for index, symbol in enumerate(symbols)}
    datas = []
    for item in robo_predict.get_list():
        base = info.get(item['rbd_id'])
        datas.append({
            'Forcast On Date': item['date'],
            'Ticker': base['ticker'].replace(' Index', '').replace(' Equity', ''),
            'In 21 business days': LABEL_TAG.get(item['predict']),
            'Ticker Name': json.loads(base['datas'])['chineseName'],
        })
    sorted_data = sorted(datas, key=lambda x: symbol_index_dict[x['Ticker'].split(' ')[0]])
    pd.DataFrame(sorted_data).to_excel("Forcast_Report.xlsx", index=False)
if __name__ == '__main__':
    # ad-hoc entry point: regenerate the multi-class Excel report
    do_reporter2()
from abc import ABC
import numpy as np
import pandas as pd
from finta import TA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from ai.config import LABEL_RANGE
def imp():
    """Print the finta TA module (debug helper)."""
    # presumably also exists to keep the TA import from looking unused,
    # since TA is otherwise only referenced via eval() below - TODO confirm
    print(TA)
class TrainingDataBuilder(ABC):
    """Builds scaled feature matrices and labels (X, y) for one prediction target."""

    def __init__(self, index, eco, fund, indexDict, toForecast, win1W, win1M, win1Q, numForecastDays,
                 theThreshold) -> None:
        super().__init__()
        self._index = index  # ids of market indexes
        self._eco = eco  # ids of economic indicators
        self._fund = fund  # ids of funds
        self._indexDict = indexDict  # id -> short ticker name
        self._toForecast = toForecast  # True = build a forecast row, False = back-test split
        self._win1W = win1W  # 1 week
        self._win1M = win1M  # 1 Month
        self._win1Q = win1Q  # 1 Quarter
        self._numForecastDays = numForecastDays  # business days, 21 business days means one month
        self._theThreshold = theThreshold
        # List of symbols for technical indicators
        # INDICATORS = ['RSI', 'MACD', 'STOCH','ADL', 'ATR', 'MOM', 'MFI', 'ROC', 'OBV', 'CCI', 'EMV', 'VORTEX']
        # Note that '14 period MFI' and '14 period EMV' is not available for forecast
        self.INDICATORS = ['RSI', 'MACD', 'STOCH', 'ADL', 'ATR', 'MOM', 'ROC', 'OBV', 'CCI', 'VORTEX']
        self.FUND_INDICATORS = []

    def get_indicator_data(self, data, pid):
        """
        Function that uses the finta API to calculate technical indicators used as the features
        """

        def indicator_calcu(data, indicators):
            """
            Indexes and funds differ: funds only carry a close price, so fewer
            indicators can be generated for them.
            @param data:
            @param indicators:
            @return:
            """
            for indicator in indicators:
                # NOTE(review): eval() over a hard-coded indicator list is not fed
                # untrusted input, but getattr(TA, indicator)(data) would avoid eval
                ind_data = eval('TA.' + indicator + '(data)')
                if not isinstance(ind_data, pd.DataFrame):
                    ind_data = ind_data.to_frame()
                data = data.merge(ind_data, left_index=True, right_index=True)
            return data

        if pid in self._index:
            data = indicator_calcu(data, self.INDICATORS)
            # Instead of using the actual volume value (which changes over time), we normalize it with a moving volume average
            data['normVol'] = data['volume'] / data['volume'].ewm(5).mean()
            # get relative values
            data['relativeOpen'] = data['open'] / data['close'].shift(1)
            data['relativeHigh'] = data['high'] / data['close'].shift(1)
            data['relativeLow'] = data['low'] / data['close'].shift(1)
            # Remove columns that won't be used as features
            # data['close'] are still needed and will be deleted later
            data.drop(['open', 'high', 'low', 'volume'], axis=1, inplace=True)
        elif pid in self._fund:
            # NOTE(review): return value discarded - a no-op while FUND_INDICATORS
            # is empty, but verify if fund indicators are ever added
            indicator_calcu(data, self.FUND_INDICATORS)
        # Also calculate moving averages for features
        data['ema50'] = data['close'] / data['close'].ewm(50).mean()
        data['ema21'] = data['close'] / data['close'].ewm(21).mean()
        data['ema15'] = data['close'] / data['close'].ewm(15).mean()
        data['ema5'] = data['close'] / data['close'].ewm(5).mean()
        data['relativeClose'] = data['close'] / data['close'].shift(1)
        return data

    def build_predict_data(self, indexData, pid):
        """Assemble the per-target feature table including the 'yLabel' column.

        @param pid: id of the index or fund to predict
        @return: feature DataFrame with technical indicators, returns, volatility and yLabel
        """

        def map_to_label(ret):
            # Map a future log-return to its LABEL_RANGE bucket; falls through to
            # None when nothing matches (NaN input)
            for label, (lower, upper) in LABEL_RANGE.items():
                if float(lower) <= ret < float(upper):
                    return label

        if pid in self._index:
            ###### get individual data from raw data
            predictData = indexData[indexData['rid_index_id'] == self._indexDict[pid]].copy()
            del (predictData['rid_index_id'])
            ###### Additional preparing SPX Data
            # finta expects properly formated ohlc DataFrame, with column names in lowercase:
            # ["open", "high", "low", close"] and ["volume"] for indicators that expect ohlcv input.
            predictData.rename(
                columns={"rid_high": 'high', 'rid_open': 'open', "rid_low": 'low', "rid_close": 'close',
                         'rid_volume': 'volume',
                         "rid_pe": f"{self._indexDict[pid]}_pe", "rid_pb": f"{self._indexDict[pid]}_pb"},
                inplace=True)
        elif pid in self._fund:
            predictData = indexData[indexData['rfn_fund_id'] == self._indexDict[pid]].copy()
            del (predictData['rfn_fund_id'])
            predictData.rename(columns={"rfn_nav_cal": 'close'}, inplace=True)
        # normalize to a sorted DatetimeIndex, then restore 'date' as a column
        predictData.set_index('date', inplace=True)
        predictData.index = pd.to_datetime(predictData.index)
        predictData.sort_index(inplace=True)
        predictData.reset_index(inplace=True)
        # Calculate the indicator data
        predictData = self.get_indicator_data(predictData, pid)
        # Calculate Historical Return and Volatility
        predictData['R1W'] = np.log(predictData['close'] / predictData['close'].shift(self._win1W))
        predictData['R1M'] = np.log(predictData['close'] / predictData['close'].shift(self._win1M))
        predictData['R1Q'] = np.log(predictData['close'] / predictData['close'].shift(self._win1Q))
        price_list = predictData['close']
        rollist = price_list.rolling(self._win1W)
        predictData['Vol_1W'] = rollist.std(ddof=0)
        rollist = price_list.rolling(self._win1M)
        predictData['Vol_1M'] = rollist.std(ddof=0)
        rollist = price_list.rolling(self._win1Q)
        predictData['Vol_1Q'] = rollist.std(ddof=0)
        # The following uses future info for the y label, to be deleted later
        predictData['futureR'] = np.log(predictData['close'].shift(-self._numForecastDays) / predictData['close'])
        # predictData = predictData[predictData['futureR'].notna()]
        predictData['yLabel'] = predictData['futureR'].apply(lambda r: map_to_label(r))
        del (predictData['close'])
        return predictData

    def build_train_test(self, pid, indexData, vixData, indexOtherData, cpiData, FDTRData, NAPMPMIData):
        """Merge all feature frames, fill gaps, scale, and produce train/test (+ forecast row).

        Returns (X_train, X_test, y_train, y_test, scaledX_forecast, forecastDay);
        in forecast mode X_test/y_test are empty and scaledX_forecast holds the
        scaled features of the most recent day.
        """
        ###### Merge Data to one table
        predictData = self.build_predict_data(indexData, pid)
        forecastDay = None
        if (self._toForecast):
            # the last available date is the day we forecast from
            forecastDay = predictData['date'].iloc[-1]
        DataAll = pd.merge(predictData, vixData, how='outer', on='date')
        DataAll = pd.merge(DataAll, indexOtherData, how='outer', on='date')
        DataAll = pd.merge(DataAll, cpiData, how='outer', on='date')
        DataAll = pd.merge(DataAll, FDTRData, how='outer', on='date')
        DataAll = pd.merge(DataAll, NAPMPMIData, how='outer', on='date')
        DataAll.set_index('date', inplace=True)
        DataAll.sort_index(inplace=True)
        DataAll.reset_index(inplace=True)
        ###### fill eco data
        for col in ['CPI_YOY', 'CPURNSA', 'CPI_MOM', 'CPI_MOM_Diff']:
            DataAll[col].bfill(inplace=True)
        for col in ['FDTR']:
            DataAll[col].ffill(inplace=True)
        # NAPMPMI: US ISM manufacturing index (monthly)
        for col in ['NAPMPMI']:
            DataAll[col].bfill(inplace=True)
            DataAll[col].ffill(inplace=True)
        # forward-fill every remaining feature column (labels excluded)
        for col in DataAll.columns:
            if col not in ['CPI_YOY', 'CPURNSA', 'CPI_MOM', 'CPI_MOM_Diff', 'futureR', 'yLabel']:
                DataAll[col].ffill(inplace=True)
        if (self._toForecast):
            # Handle CPI_YOY (US urban CPI YoY, NSA) and CPURNSA (US CPI, NSA) for the forecast row
            DataAllCopy = DataAll.copy()
            for col in ['CPI_YOY', 'CPURNSA']:
                DataAllCopy[col].ffill(inplace=True)
            for col in ['CPI_MOM', 'CPI_MOM_Diff']:
                DataAllCopy[col] = DataAllCopy[col].fillna(0)
            DataAllCopy.drop(['futureR', 'yLabel'], axis=1, inplace=True)
            forecastDayIndex = DataAllCopy.index[DataAllCopy['date'] == forecastDay]
            # iloc[:, 1:] drops the 'date' column; dropna removes columns still NaN that day
            forecastData = DataAllCopy.iloc[forecastDayIndex.to_list(), 1:]
            forecastData.dropna(inplace=True, axis=1)
            X_forecast = forecastData.to_numpy()
            del DataAllCopy
        ###### clean NaN
        DataAll.dropna(inplace=True)
        DataAll.reset_index(inplace=True, drop=True)
        ###### get X and y
        y = DataAll['yLabel'].to_numpy(copy=True)
        # delete future information
        DataAll.drop(['futureR', 'yLabel'], axis=1, inplace=True)
        X = DataAll.iloc[:, 1:].values
        ###################
        # scale data
        labels = list(LABEL_RANGE.keys())
        # scale features into [smallest label, largest label] (keys are in descending order)
        scaler = MinMaxScaler(feature_range=(labels[-1], labels[0]))
        # scaledX = scaler.fit_transform(X)
        DataScaler = scaler.fit(X)
        scaledX = DataScaler.transform(X)
        scaledX_forecast = None
        if (self._toForecast):
            scaledX_forecast = DataScaler.transform(X_forecast)
            # forecast mode: train on everything, no held-out test set
            X_train = scaledX
            y_train = y
            X_test = []
            y_test = []
        else:
            # Step 2: Split data into train set and test set
            X_train, X_test, y_train, y_test = train_test_split(scaledX, y, test_size=0.02, shuffle=False)
            # To avoid data leak, test set should start from numForecastDays later
            X_test = X_test[self._numForecastDays:]
            y_test = y_test[self._numForecastDays:]
        return X_train, X_test, y_train, y_test, scaledX_forecast, forecastDay
......@@ -23,12 +23,3 @@ urllib3==1.26.12
fastapi==0.100.0
uvicorn==0.23.1
apscheduler==3.10.1
\ No newline at end of file
scikit-learn
finta
keras
tensorflow
matplotlib
lightgbm
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment