Help me use the Voting Classifier ML strategy on futures

magenta.grimer

Hi!

This is the default voting classifier strategy for cryptos:

import logging

import pandas as pd
import xarray as xr
import numpy as np

import qnt.backtester as qnbt
import qnt.ta as qnta

def create_model():
    """Build the voting ensemble used as the classifier for one asset.

    The ensemble combines 42 RidgeClassifier and 42 SGDClassifier
    estimators. Each estimator receives its own seed drawn from a
    generator that is itself seeded with 13, so every call builds an
    identically configured ensemble. Replace the estimators or the way
    they are combined here to experiment with other models.
    """

    import random
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import RidgeClassifier, SGDClassifier

    max_seed = 2 ** 32 - 1
    seed_source = random.Random(13)

    # Several differently seeded estimators are combined by voting to
    # reduce overfitting. For each index the ridge seed is drawn first,
    # then the sgd seed.
    estimators = []
    for idx in range(42):
        estimators.extend([
            (f'ridge{idx}', RidgeClassifier(random_state=seed_source.randint(0, max_seed))),
            (f'sgd{idx}', SGDClassifier(random_state=seed_source.randint(0, max_seed))),
        ])

    return VotingClassifier(estimators)

def get_features(data):
    """Assemble the features the models learn from.

    Four features are built from the 'close', 'high', 'low' and 'vol'
    fields of ``data``:
      * trend        - qnta.roc(.., 1) of qnta.lwma(close, 70);
      * stochastic_d - second output of qnta.stochastic(high, low, close, 14);
      * volatility   - qnta.lwma(.., 14) of qnta.tr(high, low, close) / close;
      * volume       - qnta.sma(vol, 5) / qnta.sma(vol, 60), with
                       non-finite ratios replaced by 0.

    Returns the features concatenated along a 'field' dimension and
    transposed to ('time', 'field', 'asset').
    """

    close = data.sel(field='close')
    high = data.sel(field='high')
    low = data.sel(field='low')
    raw_volume = data.sel(field='vol')

    features = {}

    features['trend'] = qnta.roc(qnta.lwma(close, 70), 1)

    # only the second line of the stochastic oscillator is used as a feature
    _, features['stochastic_d'] = qnta.stochastic(high, low, close, 14)

    # true range relative to the close price, smoothed
    features['volatility'] = qnta.lwma(qnta.tr(high, low, close) / close, 14)

    # short-term versus long-term average volume; non-finite ratios become 0
    volume_ratio = qnta.sma(raw_volume, 5) / qnta.sma(raw_volume, 60)
    features['volume'] = volume_ratio.where(np.isfinite(volume_ratio), 0)

    stacked = xr.concat(
        list(features.values()),
        pd.Index(list(features.keys()), name='field'),
    )

    return stacked.transpose('time', 'field', 'asset')

def get_target_classes(data):
    """Build the class labels which the models are trained to predict.

    The label is 1 where the close price shifted by -1 (via qnta.shift)
    is greater than the current close price, and 0 everywhere else.
    """

    close_now = data.sel(field='close')
    close_next = qnta.shift(close_now, -1)

    # NOTE(review): every entry where the comparison is False gets label 0;
    # presumably this includes steps with no next price - confirm intended.
    price_goes_up = close_next > close_now

    return xr.where(price_goes_up, 1, 0)

def create_and_train_models(data):
    """Create and train the models working on an asset-by-asset basis.

    Features and target classes are computed once for all assets (on data
    from 2013-05-01 onwards), then a separate model is fitted per asset on
    the time steps where both features and target are available.

    Returns a dict mapping asset name to its fitted model. Assets with
    fewer than 10 usable training samples, or whose training raised an
    exception (which is logged), are left out of the dict.
    """

    asset_name_all = data.coords['asset'].values

    data = data.sel(time=slice('2013-05-01', None))  # cut the noisy data head before 2013-05-01

    features_all = get_features(data)
    target_all = get_target_classes(data)

    models = dict()

    for asset_name in asset_name_all:

        # drop missing values (``how`` is passed by keyword because recent
        # xarray releases no longer accept it positionally):
        target_cur = target_all.sel(asset=asset_name).dropna('time', how='any')
        features_cur = features_all.sel(asset=asset_name).dropna('time', how='any')

        # align features and targets:
        target_for_learn, features_for_learn = xr.align(target_cur, features_cur, join='inner')

        # The size check must look at the aligned set, because that is what
        # the model is actually fitted on.
        if len(features_for_learn.time) < 10:
            # not enough points for training
            continue

        model = create_model()

        try:
            model.fit(features_for_learn.values, target_for_learn.values)
        except Exception:
            # Best effort: a failure for one asset must not stop the others.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and
            # therefore still propagate.
            logging.exception('model training failed')
        else:
            models[asset_name] = model

    return models

def predict(models, data):
    """Performs prediction and generates output weights.

    Generation is performed for several days in order to speed
    up the evaluation.

    ``models`` maps asset name to a fitted model. The returned weights
    have the shape of the 'close' field of ``data``; they are 0 for assets
    without a model, for time steps with missing features, and for assets
    whose prediction raised an exception (which is logged).
    """

    weights = xr.zeros_like(data.sel(field='close'))

    asset_names = [name for name in data.coords['asset'].values if name in models]
    if not asset_names:
        return weights

    # The features do not depend on the asset being processed, so they are
    # computed once instead of once per asset.
    features_all = get_features(data)

    for asset_name in asset_names:
        # ``how`` is passed by keyword because recent xarray releases no
        # longer accept it positionally.
        features_cur = features_all.sel(asset=asset_name).dropna('time', how='any')
        if len(features_cur.time) < 1:
            continue

        try:
            weights.loc[dict(asset=asset_name, time=features_cur.time.values)] = \
                models[asset_name].predict(features_cur.values)
        except Exception:
            # Best effort: a failure for one asset must not stop the others.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and
            # therefore still propagate.
            logging.exception('model prediction failed')

    return weights


# Run the ML backtest with the train/predict callbacks defined above.
weights = qnbt.backtest_ml(
    competition_type='cryptofutures',  # competition type
    start_date='2014-01-01',           # first day of the backtest
    train=create_and_train_models,     # callback which creates and trains the models
    predict=predict,                   # callback which generates the output weights
    train_period=10*365,               # calendar days of data used for training
    lookback_period=365,               # calendar days of data predict needs to generate the output
    retrain_interval=365,              # calendar days between two retrainings
    retrain_interval_after_submit=1,   # calendar days between retrainings after submission, during evaluation
    predict_each_day=False,            # call predict for every backtest day?
                                       # Use True if you suspect that get_features is looking forward.
    build_plots=True,                  # draw the chart?
)

I would like to use it for futures markets, too.

Just changing `competition_type` to 'futures', as in the code below, does not seem to work:

import logging

import pandas as pd
import xarray as xr
import numpy as np

import qnt.backtester as qnbt
import qnt.ta as qnta

def create_model():
    """Build the voting ensemble used as the classifier for one asset.

    The ensemble combines 42 RidgeClassifier and 42 SGDClassifier
    estimators. Each estimator receives its own seed drawn from a
    generator that is itself seeded with 13, so every call builds an
    identically configured ensemble. Replace the estimators or the way
    they are combined here to experiment with other models.
    """

    import random
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import RidgeClassifier, SGDClassifier

    max_seed = 2 ** 32 - 1
    seed_source = random.Random(13)

    # Several differently seeded estimators are combined by voting to
    # reduce overfitting. For each index the ridge seed is drawn first,
    # then the sgd seed.
    estimators = []
    for idx in range(42):
        estimators.extend([
            (f'ridge{idx}', RidgeClassifier(random_state=seed_source.randint(0, max_seed))),
            (f'sgd{idx}', SGDClassifier(random_state=seed_source.randint(0, max_seed))),
        ])

    return VotingClassifier(estimators)

def get_features(data):
    """Assemble the features the models learn from.

    Four features are built from the 'close', 'high', 'low' and 'vol'
    fields of ``data``:
      * trend        - qnta.roc(.., 1) of qnta.lwma(close, 70);
      * stochastic_d - second output of qnta.stochastic(high, low, close, 14);
      * volatility   - qnta.lwma(.., 14) of qnta.tr(high, low, close) / close;
      * volume       - qnta.sma(vol, 5) / qnta.sma(vol, 60), with
                       non-finite ratios replaced by 0.

    Returns the features concatenated along a 'field' dimension and
    transposed to ('time', 'field', 'asset').
    """

    close = data.sel(field='close')
    high = data.sel(field='high')
    low = data.sel(field='low')
    raw_volume = data.sel(field='vol')

    features = {}

    features['trend'] = qnta.roc(qnta.lwma(close, 70), 1)

    # only the second line of the stochastic oscillator is used as a feature
    _, features['stochastic_d'] = qnta.stochastic(high, low, close, 14)

    # true range relative to the close price, smoothed
    features['volatility'] = qnta.lwma(qnta.tr(high, low, close) / close, 14)

    # short-term versus long-term average volume; non-finite ratios become 0
    volume_ratio = qnta.sma(raw_volume, 5) / qnta.sma(raw_volume, 60)
    features['volume'] = volume_ratio.where(np.isfinite(volume_ratio), 0)

    stacked = xr.concat(
        list(features.values()),
        pd.Index(list(features.keys()), name='field'),
    )

    return stacked.transpose('time', 'field', 'asset')

def get_target_classes(data):
    """Build the class labels which the models are trained to predict.

    The label is 1 where the close price shifted by -1 (via qnta.shift)
    is greater than the current close price, and 0 everywhere else.
    """

    close_now = data.sel(field='close')
    close_next = qnta.shift(close_now, -1)

    # NOTE(review): every entry where the comparison is False gets label 0;
    # presumably this includes steps with no next price - confirm intended.
    price_goes_up = close_next > close_now

    return xr.where(price_goes_up, 1, 0)

def create_and_train_models(data):
    """Create and train the models working on an asset-by-asset basis.

    Features and target classes are computed once for all assets (on data
    from 2013-05-01 onwards), then a separate model is fitted per asset on
    the time steps where both features and target are available.

    Returns a dict mapping asset name to its fitted model. Assets with
    fewer than 10 usable training samples, or whose training raised an
    exception (which is logged), are left out of the dict.
    """

    asset_name_all = data.coords['asset'].values

    data = data.sel(time=slice('2013-05-01', None))  # cut the noisy data head before 2013-05-01

    features_all = get_features(data)
    target_all = get_target_classes(data)

    models = dict()

    for asset_name in asset_name_all:

        # drop missing values (``how`` is passed by keyword because recent
        # xarray releases no longer accept it positionally):
        target_cur = target_all.sel(asset=asset_name).dropna('time', how='any')
        features_cur = features_all.sel(asset=asset_name).dropna('time', how='any')

        # align features and targets:
        target_for_learn, features_for_learn = xr.align(target_cur, features_cur, join='inner')

        # The size check must look at the aligned set, because that is what
        # the model is actually fitted on.
        if len(features_for_learn.time) < 10:
            # not enough points for training
            continue

        model = create_model()

        try:
            model.fit(features_for_learn.values, target_for_learn.values)
        except Exception:
            # Best effort: a failure for one asset must not stop the others.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and
            # therefore still propagate.
            logging.exception('model training failed')
        else:
            models[asset_name] = model

    return models

def predict(models, data):
    """Performs prediction and generates output weights.

    Generation is performed for several days in order to speed
    up the evaluation.

    ``models`` maps asset name to a fitted model. The returned weights
    have the shape of the 'close' field of ``data``; they are 0 for assets
    without a model, for time steps with missing features, and for assets
    whose prediction raised an exception (which is logged).
    """

    weights = xr.zeros_like(data.sel(field='close'))

    asset_names = [name for name in data.coords['asset'].values if name in models]
    if not asset_names:
        return weights

    # The features do not depend on the asset being processed, so they are
    # computed once instead of once per asset.
    features_all = get_features(data)

    for asset_name in asset_names:
        # ``how`` is passed by keyword because recent xarray releases no
        # longer accept it positionally.
        features_cur = features_all.sel(asset=asset_name).dropna('time', how='any')
        if len(features_cur.time) < 1:
            continue

        try:
            weights.loc[dict(asset=asset_name, time=features_cur.time.values)] = \
                models[asset_name].predict(features_cur.values)
        except Exception:
            # Best effort: a failure for one asset must not stop the others.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and
            # therefore still propagate.
            logging.exception('model prediction failed')

    return weights


# Run the ML backtest with the train/predict callbacks defined above.
weights = qnbt.backtest_ml(
    competition_type='futures',        # competition type
    start_date='2014-01-01',           # first day of the backtest
    train=create_and_train_models,     # callback which creates and trains the models
    predict=predict,                   # callback which generates the output weights
    train_period=10*365,               # calendar days of data used for training
    lookback_period=365,               # calendar days of data predict needs to generate the output
    retrain_interval=365,              # calendar days between two retrainings
    retrain_interval_after_submit=1,   # calendar days between retrainings after submission, during evaluation
    predict_each_day=False,            # call predict for every backtest day?
                                       # Use True if you suspect that get_features is looking forward.
    build_plots=True,                  # draw the chart?
)

magenta.grimer

@magenta-grimer Sorry, it now seems to work as it is.