Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from pandas.tseries.offsets import DateOffset
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")
Load the data
path_train = "../preocess_data/train_data_o.csv"
path_test = "../data/test_data.csv"
data = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)
data["运营日期"] = pd.to_datetime(data["运营日期"] )
data_test["运营日期"] = pd.to_datetime(data_test["日期"])
data.drop(["行ID","日期"],axis=1,inplace=True)
data_test.drop(["行ID","日期"],axis=1,inplace=True)
Encode the discount column (with drop="if_binary", the binary 折扣 flag becomes a single 0/1 column)
enc = OneHotEncoder(drop="if_binary")
enc.fit(data["折扣"].values.reshape(-1,1))
enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
array([[0.],
[0.],
[0.],
...,
[1.],
[0.],
[0.]])
data["折扣"] = enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
data_test["折扣"] = enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
Derive date features
def time_derivation(t, col="运营日期"):
    t["year"] = t[col].dt.year
    t["month"] = t[col].dt.month
    t["day"] = t[col].dt.day
    t["quarter"] = t[col].dt.quarter
    # dt.weekofyear was removed in pandas 2.0; isocalendar().week is the replacement
    t["weekofyear"] = t[col].dt.isocalendar().week.astype(int)
    t["dayofweek"] = t[col].dt.dayofweek + 1
    t["weekend"] = (t["dayofweek"] > 5).astype(int)
    return t
data_train = time_derivation(data)
data_test_ = time_derivation(data_test)
Explore each store separately
Try store 1 first
# training set
data_train_1 = data_train[data_train["商店ID"] == 1]
# test set
data_test_1 = data_test_[data_test_["商店ID"] == 1]
plt.figure(figsize=(16,8))
plt.plot(data_train_1["运营日期"],data_train_1["销量"])
plt.xlabel("日期",fontsize=20)
plt.ylabel("销量",fontsize=20)
plt.title("1号店的销量",fontsize=20)
[Figure: line plot of store 1's daily 销量 over the training period]
First modeling attempt
data_train_1.head()
| | 商店ID | 商店类型 | 位置 | 地区 | 节假日 | 折扣 | 销量 | 运营日期 | year | month | day | quarter | weekofyear | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | S1 | L3 | R1 | 1 | 1.0 | 7011.84 | 2018-01-01 | 2018 | 1 | 1 | 1 | 1 | 1 | 0 |
| 607 | 1 | S1 | L3 | R1 | 0 | 1.0 | 42369.00 | 2018-01-02 | 2018 | 1 | 2 | 1 | 1 | 2 | 0 |
| 1046 | 1 | S1 | L3 | R1 | 0 | 1.0 | 50037.00 | 2018-01-03 | 2018 | 1 | 3 | 1 | 1 | 3 | 0 |
| 1207 | 1 | S1 | L3 | R1 | 0 | 1.0 | 44397.00 | 2018-01-04 | 2018 | 1 | 4 | 1 | 1 | 4 | 0 |
| 1752 | 1 | S1 | L3 | R1 | 0 | 1.0 | 47604.00 | 2018-01-05 | 2018 | 1 | 5 | 1 | 1 | 5 | 0 |
Copy the data
data_train__1 = data_train_1.copy()
data_test__1 = data_test_1.copy()
Drop the attributes that never change within a single store
data_train__1.drop(["商店类型","商店ID","位置","地区","运营日期"],axis=1,inplace=True)
data_test__1.drop(["商店类型","商店ID","位置","地区","运营日期"],axis=1,inplace=True)
A point worth pausing on here: the days immediately around a holiday are also likely to affect sales, so holiday-proximity features could add signal; see the sketch below.
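A hedged sketch of that idea (the helper name and the two derived feature names are mine, not from the post; it assumes daily rows, the 运营日期 column before it is dropped, and the 0/1 节假日 flag used above):

def add_holiday_distance(df, date_col="运营日期", hol_col="节假日"):
    # Days since the previous holiday and days until the next one.
    # Rows before the first / after the last holiday come out as NaN.
    df = df.sort_values(date_col).copy()
    hol_dates = df.loc[df[hol_col] == 1, date_col]
    prev_hol = pd.merge_asof(df[[date_col]], hol_dates.to_frame("prev"),
                             left_on=date_col, right_on="prev", direction="backward")
    next_hol = pd.merge_asof(df[[date_col]], hol_dates.to_frame("next"),
                             left_on=date_col, right_on="next", direction="forward")
    df["days_since_holiday"] = (df[date_col] - prev_hol["prev"].values).dt.days
    df["days_until_holiday"] = (next_hol["next"].values - df[date_col]).dt.days
    return df

Calling this before the drop above (e.g. on data_train_1) would give the models a sense of how far each day sits from a holiday.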
data_train__1.shape
(516, 10)
data_train__1
| | 节假日 | 折扣 | 销量 | year | month | day | quarter | weekofyear | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1.0 | 7011.84 | 2018 | 1 | 1 | 1 | 1 | 1 | 0 |
| 607 | 0 | 1.0 | 42369.00 | 2018 | 1 | 2 | 1 | 1 | 2 | 0 |
| 1046 | 0 | 1.0 | 50037.00 | 2018 | 1 | 3 | 1 | 1 | 3 | 0 |
| 1207 | 0 | 1.0 | 44397.00 | 2018 | 1 | 4 | 1 | 1 | 4 | 0 |
| 1752 | 0 | 1.0 | 47604.00 | 2018 | 1 | 5 | 1 | 1 | 5 | 0 |
| … | … | … | … | … | … | … | … | … | … | … |
| 186569 | 0 | 1.0 | 33075.00 | 2019 | 5 | 27 | 2 | 22 | 1 | 0 |
| 187165 | 0 | 1.0 | 37317.00 | 2019 | 5 | 28 | 2 | 22 | 2 | 0 |
| 187391 | 0 | 1.0 | 44652.00 | 2019 | 5 | 29 | 2 | 22 | 3 | 0 |
| 187962 | 0 | 1.0 | 42387.00 | 2019 | 5 | 30 | 2 | 22 | 4 | 0 |
| 188113 | 1 | 1.0 | 39843.78 | 2019 | 5 | 31 | 2 | 22 | 5 | 0 |
516 rows × 10 columns
data_train__1["diff_1"] = data_train__1["销量"].diff(10)
data_train__1 = data_train__1.dropna()
# data_test__1["diff_1"] = data_test__1["销量"].diff(1)
# data_test__1 = data_test__1.dropna()
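If a differenced series like this were ever used as the prediction target, the forecast would have to be inverted back into sales units. A minimal runnable sketch (illustrative only; the models below predict 销量 directly):

k = 10
sales = data_train__1["销量"]
diff_k = sales - sales.shift(k)       # same as sales.diff(k)
recovered = diff_k + sales.shift(k)   # invert by adding back the value k steps earlier
mask = recovered.notna()
assert np.allclose(recovered[mask], sales[mask])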
Reframe the series as a supervised dataset
def series_to_supervised_(data, step_in, step_out, dropnan=True):
    """
    :param data: observed series, a list or a 2-D array/DataFrame
    :param step_in: number of lagged observations used as input (x)
    :param step_out: number of observations used as output (y)
    :param dropnan: whether to drop rows containing NaN, default True
    :return: DataFrame reframed for supervised learning
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = data
    cols = []
    names = []
    # input sequence [(t-n), (t-n+1), (t-n+2), ..., (t-1)]
    for i in range(step_in, 0, -1):
        cols.append(df.shift(i))
        names += [f"{name}-({i})step" for name in df.columns]
    # output sequence [t, (t+1), (t+2), ..., (t+n)]
    for i in range(0, step_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [f"{name}+(0)step" for name in df.columns]
        else:
            names += [f"{name}+({i})step" for name in df.columns]
    df_re = pd.concat(cols, axis=1)
    df_re.columns = names
    if dropnan:
        df_re.dropna(inplace=True)
    return df_re
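A quick toy check of what the reshaping produces (hypothetical two-column frame, my example):

toy = pd.DataFrame({"a": range(5), "b": range(10, 15)})
print(series_to_supervised_(toy, step_in=1, step_out=1))
#    a-(1)step  b-(1)step  a+(0)step  b+(0)step
# 1        0.0       10.0          1         11
# 2        1.0       11.0          2         12
# 3        2.0       12.0          3         13
# 4        3.0       13.0          4         14

Each row pairs the previous observation (the -(1)step columns) with the current one (the +(0)step columns); the first row is dropped because its lag is NaN.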
data_step = series_to_supervised_(data_train__1, step_in=10, step_out=2, dropnan=True)
data_step
| | 节假日-(10)step | 折扣-(10)step | 销量-(10)step | year-(10)step | month-(10)step | day-(10)step | quarter-(10)step | weekofyear-(10)step | dayofweek-(10)step | weekend-(10)step | … | 折扣+(1)step | 销量+(1)step | year+(1)step | month+(1)step | day+(1)step | quarter+(1)step | weekofyear+(1)step | dayofweek+(1)step | weekend+(1)step | diff_1+(1)step |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7658 | 0.0 | 0.0 | 36873.00 | 2018.0 | 1.0 | 11.0 | 1.0 | 2.0 | 4.0 | 0.0 | … | 0.0 | 44178.75 | 2018.0 | 1.0 | 22.0 | 1.0 | 4.0 | 1.0 | 0.0 | 18534.75 |
| 7791 | 0.0 | 0.0 | 25644.00 | 2018.0 | 1.0 | 12.0 | 1.0 | 2.0 | 5.0 | 0.0 | … | 0.0 | 28086.00 | 2018.0 | 1.0 | 23.0 | 1.0 | 4.0 | 2.0 | 0.0 | -12804.00 |
| 8337 | 0.0 | 1.0 | 40890.00 | 2018.0 | 1.0 | 13.0 | 1.0 | 2.0 | 6.0 | 1.0 | … | 0.0 | 47835.00 | 2018.0 | 1.0 | 24.0 | 1.0 | 4.0 | 3.0 | 0.0 | 8580.60 |
| 8610 | 1.0 | 1.0 | 39254.40 | 2018.0 | 1.0 | 14.0 | 1.0 | 2.0 | 7.0 | 1.0 | … | 1.0 | 45384.00 | 2018.0 | 1.0 | 25.0 | 1.0 | 4.0 | 4.0 | 0.0 | -6609.00 |
| 8821 | 0.0 | 1.0 | 51993.00 | 2018.0 | 1.0 | 15.0 | 1.0 | 3.0 | 1.0 | 0.0 | … | 1.0 | 25868.88 | 2018.0 | 1.0 | 26.0 | 1.0 | 4.0 | 5.0 | 0.0 | -11847.12 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 186152 | 0.0 | 1.0 | 47619.00 | 2019.0 | 5.0 | 16.0 | 2.0 | 20.0 | 4.0 | 0.0 | … | 1.0 | 33075.00 | 2019.0 | 5.0 | 27.0 | 2.0 | 22.0 | 1.0 | 0.0 | -23178.00 |
| 186569 | 0.0 | 1.0 | 56253.00 | 2019.0 | 5.0 | 17.0 | 2.0 | 20.0 | 5.0 | 0.0 | … | 1.0 | 37317.00 | 2019.0 | 5.0 | 28.0 | 2.0 | 22.0 | 2.0 | 0.0 | -15560.82 |
| 187165 | 1.0 | 1.0 | 52877.82 | 2019.0 | 5.0 | 18.0 | 2.0 | 20.0 | 6.0 | 1.0 | … | 1.0 | 44652.00 | 2019.0 | 5.0 | 29.0 | 2.0 | 22.0 | 3.0 | 0.0 | -24081.00 |
| 187391 | 0.0 | 1.0 | 68733.00 | 2019.0 | 5.0 | 19.0 | 2.0 | 20.0 | 7.0 | 1.0 | … | 1.0 | 42387.00 | 2019.0 | 5.0 | 30.0 | 2.0 | 22.0 | 4.0 | 0.0 | 18891.00 |
| 187962 | 0.0 | 0.0 | 23496.00 | 2019.0 | 5.0 | 20.0 | 2.0 | 21.0 | 1.0 | 0.0 | … | 1.0 | 39843.78 | 2019.0 | 5.0 | 31.0 | 2.0 | 22.0 | 5.0 | 0.0 | 3231.78 |
495 rows × 132 columns
list(data_step.columns)[-11:]
['节假日+(1)step',
'折扣+(1)step',
'销量+(1)step',
'year+(1)step',
'month+(1)step',
'day+(1)step',
'quarter+(1)step',
'weekofyear+(1)step',
'dayofweek+(1)step',
'weekend+(1)step',
'diff_1+(1)step']
y_columns = "销量+(1)step"
y = data_step[y_columns]
x = data_step[list(data_step.columns)[:-11]]
data_ = pd.concat([x,y],axis=1)
data_
| | 节假日-(10)step | 折扣-(10)step | 销量-(10)step | year-(10)step | month-(10)step | day-(10)step | quarter-(10)step | weekofyear-(10)step | dayofweek-(10)step | weekend-(10)step | … | 销量+(0)step | year+(0)step | month+(0)step | day+(0)step | quarter+(0)step | weekofyear+(0)step | dayofweek+(0)step | weekend+(0)step | diff_1+(0)step | 销量+(1)step |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7658 | 0.0 | 0.0 | 36873.00 | 2018.0 | 1.0 | 11.0 | 1.0 | 2.0 | 4.0 | 0.0 | … | 44625.00 | 2018 | 1 | 21 | 1 | 3 | 7 | 1 | 7752.00 | 44178.75 |
| 7791 | 0.0 | 0.0 | 25644.00 | 2018.0 | 1.0 | 12.0 | 1.0 | 2.0 | 5.0 | 0.0 | … | 44178.75 | 2018 | 1 | 22 | 1 | 4 | 1 | 0 | 18534.75 | 28086.00 |
| 8337 | 0.0 | 1.0 | 40890.00 | 2018.0 | 1.0 | 13.0 | 1.0 | 2.0 | 6.0 | 1.0 | … | 28086.00 | 2018 | 1 | 23 | 1 | 4 | 2 | 0 | -12804.00 | 47835.00 |
| 8610 | 1.0 | 1.0 | 39254.40 | 2018.0 | 1.0 | 14.0 | 1.0 | 2.0 | 7.0 | 1.0 | … | 47835.00 | 2018 | 1 | 24 | 1 | 4 | 3 | 0 | 8580.60 | 45384.00 |
| 8821 | 0.0 | 1.0 | 51993.00 | 2018.0 | 1.0 | 15.0 | 1.0 | 3.0 | 1.0 | 0.0 | … | 45384.00 | 2018 | 1 | 25 | 1 | 4 | 4 | 0 | -6609.00 | 25868.88 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 186152 | 0.0 | 1.0 | 47619.00 | 2019.0 | 5.0 | 16.0 | 2.0 | 20.0 | 4.0 | 0.0 | … | 25035.00 | 2019 | 5 | 26 | 2 | 21 | 7 | 1 | -22584.00 | 33075.00 |
| 186569 | 0.0 | 1.0 | 56253.00 | 2019.0 | 5.0 | 17.0 | 2.0 | 20.0 | 5.0 | 0.0 | … | 33075.00 | 2019 | 5 | 27 | 2 | 22 | 1 | 0 | -23178.00 | 37317.00 |
| 187165 | 1.0 | 1.0 | 52877.82 | 2019.0 | 5.0 | 18.0 | 2.0 | 20.0 | 6.0 | 1.0 | … | 37317.00 | 2019 | 5 | 28 | 2 | 22 | 2 | 0 | -15560.82 | 44652.00 |
| 187391 | 0.0 | 1.0 | 68733.00 | 2019.0 | 5.0 | 19.0 | 2.0 | 20.0 | 7.0 | 1.0 | … | 44652.00 | 2019 | 5 | 29 | 2 | 22 | 3 | 0 | -24081.00 | 42387.00 |
| 187962 | 0.0 | 0.0 | 23496.00 | 2019.0 | 5.0 | 20.0 | 2.0 | 21.0 | 1.0 | 0.0 | … | 42387.00 | 2019 | 5 | 30 | 2 | 22 | 4 | 0 | 18891.00 | 39843.78 |
495 rows × 122 columns
Split into training and test sets
lens = -1  # the last column of data_ is the target 销量+(1)step
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
data_train_,data_test_ = train_test_split(data_,test_size=0.2,shuffle=False)
data_train_ = shuffle(data_train_,random_state=1412)
xtrain,ytrain = data_train_.iloc[:,:lens],data_train_.iloc[:,lens]
xtest,ytest = data_test_.iloc[:,:lens],data_test_.iloc[:,lens]
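Shuffling only the training block keeps the test block chronologically last, which avoids look-ahead leakage in the split itself. A walk-forward scheme usually gives a less optimistic estimate; a sketch with sklearn's TimeSeriesSplit (my addition, reusing the data_ and lens defined above):

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

# Each fold trains on an earlier block and validates on the block right after it.
tscv = TimeSeriesSplit(n_splits=5)
X_all, y_all = data_.iloc[:, :lens], data_.iloc[:, lens]
for fold, (tr_idx, va_idx) in enumerate(tscv.split(X_all)):
    model = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=1412)
    model.fit(X_all.iloc[tr_idx], y_all.iloc[tr_idx])
    print(fold, model.score(X_all.iloc[va_idx], y_all.iloc[va_idx]))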
Normalize the target
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(ytrain.values.reshape(-1,1))
y_train = scaler.transform(ytrain.values.reshape(-1,1))
y_test = scaler.transform(ytest.values.reshape(-1,1))
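Note that the scaled y_train / y_test are not actually used below; the models are fit on the raw ytrain / ytest. If a model were trained on the scaled target, its predictions would need scaler.inverse_transform before errors could be computed in sales units. A round-trip check (my addition):

# inverse_transform undoes the min-max scaling exactly (up to float error)
back = scaler.inverse_transform(y_test)
assert np.allclose(back.ravel(), ytest.values)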
Random forest
from sklearn.ensemble import RandomForestRegressor
rf_clf = RandomForestRegressor(max_depth=12,
min_impurity_decrease=0.0,
n_estimators= 300)
rf_clf.fit(xtrain,ytrain)
rf_clf.score(xtest,ytest)
0.43699715354836455
from sklearn.metrics import mean_squared_error
mean_squared_error(ytest,rf_clf.predict(xtest))**0.5
10330.02670862774
mean_squared_error(ytrain,rf_clf.predict(xtrain))**0.5
3440.408502696517
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sum(np.abs(y_true - y_pred) * 2) / np.sum(np.abs(y_true) + np.abs(y_pred))

def prophet_smape(y_true, y_pred):
    smape_val = symmetric_mean_absolute_percentage_error(y_true, y_pred)
    return 'SMAPE', smape_val, False
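This is the ratio-of-sums (weighted) variant of SMAPE; the more common textbook definition averages the per-point ratios instead. For comparison (my addition, not from the post):

def smape_pointwise(y_true, y_pred):
    # Conventional SMAPE: mean of per-point symmetric percentage errors.
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))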
prophet_smape(ytrain,rf_clf.predict(xtrain))
('SMAPE', 0.07417421103653893, False)
prophet_smape(ytest,rf_clf.predict(xtest))
xgboost
from xgboost import XGBRegressor
xgb_clf = XGBRegressor(max_depth=12,
n_estimators=500)
xgb_clf.fit(xtrain,ytrain)
xgb_clf.score(xtest,ytest)
0.40055294233563
mean_squared_error(ytest,xgb_clf.predict(xtest))**0.5
10659.125252942618
mean_squared_error(ytrain,xgb_clf.predict(xtrain))**0.5
0.0036928046625203763
prophet_smape(ytest,xgb_clf.predict(xtest))
('SMAPE', 0.22077657322671448, False)
prophet_smape(ytrain,xgb_clf.predict(xtrain))
('SMAPE', 7.480844639369475e-08, False)
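A train SMAPE near 1e-7 against a test SMAPE of 0.22 means the booster has memorized the training window. One hedged way to rein that in is a shallower, slower model with early stopping; a sketch only, not the post's approach (the early_stopping_rounds constructor argument is the xgboost >= 1.6 style, older versions pass it to fit(); eval_set reuses the test split purely for illustration, a separate validation slice would avoid leakage):

xgb_reg = XGBRegressor(max_depth=5, n_estimators=2000, learning_rate=0.05,
                       subsample=0.8, colsample_bytree=0.8,
                       early_stopping_rounds=50)
xgb_reg.fit(xtrain, ytrain, eval_set=[(xtest, ytest)], verbose=False)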
Try Bayesian optimization to see whether the length of the history window (the time step) is the problem.
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss
# Define the objective function
def hyperopt_objective(params):
    data_step = series_to_supervised_(data_train__1, step_in=int(params["time_step"]), step_out=2, dropnan=True)
    y_columns = "销量+(0)step"
    y = data_step[y_columns]
    # keep all lag features plus the two known-in-advance step-0 columns (节假日, 折扣)
    x = data_step[list(data_step.columns)[:-20]]
    data_ = pd.concat([x, y], axis=1)
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    data_train, data_test = train_test_split(data_, test_size=0.2, shuffle=False)
    data_train = shuffle(data_train, random_state=1412)
    xtrain, ytrain = data_train.iloc[:, :-20], data_train.iloc[:, -1]
    xtest, ytest = data_test.iloc[:, :-20], data_test.iloc[:, -1]
    # normalize the target
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(ytrain.values.reshape(-1, 1))
    ytrain = scaler.transform(ytrain.values.reshape(-1, 1))
    ytest = scaler.transform(ytest.values.reshape(-1, 1))
    clf = RandomForestRegressor(n_estimators=int(params["n_estimators"]),
                                max_depth=int(params["max_depth"]),
                                # max_features=int(min(params["max_features"], len(xtrain.columns))),
                                min_impurity_decrease=params["min_impurity_decrease"],
                                random_state=1412,
                                verbose=False,
                                n_jobs=-1
                                ).fit(xtrain, ytrain.ravel())
    scores = clf.score(xtest, ytest)
    return -scores
# Define the parameter space
params_grid = {"n_estimators": hp.quniform("n_estimators", 10, 1000, 20),
               "max_depth": hp.quniform("max_depth", 5, 25, 1),
               # "max_features": hp.quniform("max_features", 10, 10000, 1),
               "min_impurity_decrease": hp.quniform("min_impurity_decrease", 0, 5, 1),
               "time_step": hp.quniform("time_step", 10, 200, 5)
               }
# Define the optimization loop
def param_hyperopt(max_evals=100):
    trials = Trials()
    early_stop_fn = no_progress_loss(50)
    params_best = fmin(hyperopt_objective,
                       space=params_grid,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       verbose=True,
                       trials=trials,
                       early_stop_fn=early_stop_fn
                       )
    print("\n", "\n", "best params:", params_best, "\n")
    return params_best, trials
params_best,trials = param_hyperopt(max_evals = 300)
34%|███████████████ | 103/300 [02:59<05:43, 1.75s/trial, best loss: -0.48958559215129405]
best params: {'max_depth': 9.0, 'min_impurity_decrease': 0.0, 'n_estimators': 660.0, 'time_step': 10.0}
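hp.quniform hands back floats, so the best values need casting before a refit. A sketch that mirrors the objective's setup (my addition, same [:-20] column slicing as above):

best_step = int(params_best["time_step"])
data_step_b = series_to_supervised_(data_train__1, step_in=best_step, step_out=2, dropnan=True)
xb = data_step_b[list(data_step_b.columns)[:-20]]
yb = data_step_b["销量+(0)step"]
best_rf = RandomForestRegressor(n_estimators=int(params_best["n_estimators"]),
                                max_depth=int(params_best["max_depth"]),
                                min_impurity_decrease=params_best["min_impurity_decrease"],
                                random_state=1412, n_jobs=-1).fit(xb, yb)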
So far none of these results are very good. Thinking further along the time axis, the next step would be to difference within each time window and derive additional features to squeeze out more signal. Before that, consider one more quick look at the differenced attributes.
data_train__1["销量"].diff()-data_train__1["销量"].diff()
| | 节假日 | 折扣 | 销量 | year | month | day | quarter | weekofyear | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | -35357.16 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 607 | 0.0 | 0.0 | -7668.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 1046 | 0.0 | 0.0 | 5640.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 1207 | 0.0 | 0.0 | -3207.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 1752 | 0.0 | 1.0 | 23109.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | -1.0 |
| … | … | … | … | … | … | … | … | … | … | … |
| 186569 | 0.0 | 0.0 | -4242.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 187165 | 0.0 | 0.0 | -7335.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 187391 | 0.0 | 0.0 | 2265.00 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 187962 | -1.0 | 0.0 | 2543.22 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 |
| 188113 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
516 rows × 10 columns