admin管理员组

文章数量:1188851

I am trying to run time series forecasting on around 8 datasets.

They look like this:

from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# Sample monthly rows; the tuple positions line up with the schema fields
# declared below (date, total_invoiced_volume, monthly_avg_tmax, Week_Weight).
#
# The last two columns are declared as DoubleType, so every value in them is
# written as a Python float. PySpark's schema verification does not accept a
# Python int (e.g. `1` or `24`) for a DoubleType field and raises a TypeError
# in createDataFrame.
data = [
    ("2022-07-01", 11767424, 25.774193548387096, 1.0),
    ("2022-08-01", 13331928, 25.677419354838708, 1.0),
    ("2022-09-01", 11194711, 19.633333333333333, 1.0),
    ("2022-10-01", 11506759, 17.967741935483872, 1.0),
    ("2022-11-01", 9525865, 12.933333333333334, 1.0),
    ("2022-12-01", 8438520, 7.96774193548387, 1.0),
    ("2023-01-01", 8811170, 8.806451612903226, 1.0),
    ("2023-02-01", 11417707, 10.464285714285714, 1.0),
    ("2023-03-01", 12539421, 10.935483870967742, 1.0),
    ("2023-04-01", 10824295, 13.5, 1.0),
    ("2023-05-01", 11651067, 17.538709677419355, 1.1),
    ("2023-06-01", 12870035, 24.043333333333333, 1.0),
    ("2023-07-01", 10717095, 22.032258064516128, 1.0),
    ("2023-08-01", 8262566, 22.483870967741936, 1.0),
    ("2023-09-01", 7564720, 23.53333333333333, 1.0),
    ("2023-10-01", 8540128, 17.580645161290324, 1.0),
    ("2023-11-01", 9278635, 11.366666666666667, 1.0),
    ("2023-12-01", 9293826, 10.580645161290322, 1.0),
    ("2024-01-01", 9628144, 8.483870967741936, 1.1),
    ("2024-02-01", 10127209, 11.96551724137931, 1.0),
    ("2024-03-01", 11405640, 12.419354838709678, 1.0),
    ("2024-04-01", 16075499, 14.26666666666667, 1.1),
    ("2024-05-01", 14236947, 18.64516129032258, 1.0),
    ("2024-06-01", 13946271, 20.7, 1.0),
    ("2024-07-01", 12968261, 22.548387096774192, 1.0),
    ("2024-08-01", 11624620, 24.0, 1.0),
    ("2024-09-01", 9705773, 19.3, 1.0),
    ("2024-10-01", 11821238, 16.096774193548388, 1.0),
    ("2024-11-01", 11457334, 11.166666666666666, 1.0),
    ("2024-12-01", 10537240, 9.66451612903226, 1.0),
]

# Explicit schema: the date is kept as a string here and parsed later by the
# consumer; all fields are nullable.
schema = StructType([
    StructField("date", StringType(), True),
    StructField("total_invoiced_volume", IntegerType(), True),
    StructField("monthly_avg_tmax", DoubleType(), True),
    StructField("Week_Weight", DoubleType(), True),
])

# `spark` and `display` are not defined in this snippet; they are expected to
# be provided by the notebook environment.
gro_monthly_ev = spark.createDataFrame(data, schema)

display(gro_monthly_ev)

All of these datasets are stored in a folder, and I created a series of forecasting models to predict sales 3, 6, 9, and 12 months ahead.

This is the code that takes all the datasets and applies the time series forecasting algorithms.

%python
import pandas as pd
import os
import glob
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from tqdm import tqdm
import numpy as np

# Batch forecasting script: for every monthly CSV in `input_dir`, fit each
# model on all but the most recent months, score it on the held-out months,
# and write two CSV reports (metrics and forecasts) to `output_dir`.

# Define directories
input_dir = "/Workspace/Users/gab/path_to_merged_data"
output_dir = "/Workspace/Users/gab/forecast_results"
os.makedirs(output_dir, exist_ok=True)

# List all monthly datasets. glob returns paths in arbitrary order, so sort
# them to make the order of the output rows reproducible between runs.
file_list = sorted(
    path
    for path in glob.glob(os.path.join(input_dir, "*.csv"))
    if "monthly" in path
)

# Initialize results storage
results = []
future_forecasts = []

# Column layouts of the two reports. Passing them explicitly keeps the CSV
# headers present even when no dataset produced any rows.
results_columns = ["model", "dataset", "horizon", "mape", "accuracy"]
future_forecasts_columns = [
    "model", "forecast_date", "forecast", "dataset", "horizon",
]

# Forecasting horizons (months ahead)
forecast_horizons = [3, 6, 9, 12]

# Define models
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42),
    "Random Forest": RandomForestRegressor(max_depth=4, random_state=42),
    "XGBoost": XGBRegressor(max_depth=4, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
}

# Process each dataset
for file in file_list:
    # Load dataset
    dataset_name = os.path.basename(file)
    df = pd.read_csv(file)

    # Datasets with missing values are skipped; report it instead of
    # dropping them silently so a missing dataset in the output is explained.
    missing_count = int(df.isnull().sum().sum())
    if missing_count > 0:
        print(f"Skipping {dataset_name}: {missing_count} missing value(s)")
        continue

    # Ensure date column is in datetime format
    df["date"] = pd.to_datetime(df["date"])

    # Process variables
    target_col = "total_invoiced_volume"
    # Temperatures are rounded to whole numbers before modelling.
    df["monthly_avg_tmax"] = df["monthly_avg_tmax"].round(0).astype(int)

    # Prepare train and test sets: everything from (last date - 3 months)
    # onwards is held out. The comparison is inclusive, so monthly data
    # yields a 4-row test set.
    df.sort_values(by="date", inplace=True)
    last_date = df["date"].max()
    cutoff = last_date - pd.DateOffset(months=3)
    in_test = df["date"] >= cutoff
    test = df[in_test]
    train = df[~in_test]

    # Nothing to fit on, or too few hold-out points for R^2 to be defined.
    if train.empty or len(test) < 2:
        print(
            f"Skipping {dataset_name}: not enough rows for a train/test split "
            f"(train={len(train)}, test={len(test)})"
        )
        continue

    # Features and target: every column except the date and the target.
    features = [
        column for column in df.columns if column not in ["date", target_col]
    ]
    X_train, y_train = train[features], train[target_col]
    X_test, y_test = test[features], test[target_col]

    # Train and forecast with each model
    for model_name, model in tqdm(models.items(), desc=f"Processing {dataset_name}"):
        # Train the model
        model.fit(X_train, y_train)

        # Predict on test data
        y_pred = model.predict(X_test)

        # Calculate metrics.
        # NOTE: the "accuracy" column holds R^2 (coefficient of
        # determination), not a percentage accuracy. R^2 is negative whenever
        # the model's squared error on the hold-out is larger than that of
        # simply predicting the hold-out mean, and it is very unstable on a
        # hold-out of only a few rows. The column name is kept so existing
        # consumers of the CSV keep working.
        mape = mean_absolute_percentage_error(y_test, y_pred)
        accuracy = r2_score(y_test, y_pred)

        # Store results. The metrics come from the single hold-out above, so
        # they are identical for every horizon of a given model and dataset.
        for horizon in forecast_horizons:
            # First `horizon` month starts strictly after the last observed
            # date; adding MonthBegin(1) also works when the last date is not
            # itself a month start.
            future_dates = pd.date_range(
                start=last_date + pd.offsets.MonthBegin(1),
                periods=horizon,
                freq="MS",
            )

            # NOTE(review): no future feature values are available, so the
            # "future" forecasts are the hold-out predictions re-labelled with
            # future dates, capped at the hold-out length. Slicing `y_pred`
            # gives the same values as predicting the same leading rows again.
            future_y = y_pred[:min(horizon, len(X_test))]

            results.append({
                "model": model_name,
                "dataset": dataset_name,
                "horizon": horizon,
                "mape": mape,
                "accuracy": accuracy
            })

            # Store future forecast. Dataset and horizon are recorded so rows
            # from different datasets and horizons can be told apart; they are
            # appended after the original columns to keep those in place.
            for forecast_date, forecast in zip(future_dates, future_y):
                future_forecasts.append({
                    "model": model_name,
                    "forecast_date": forecast_date,
                    "forecast": forecast,
                    "dataset": dataset_name,
                    "horizon": horizon
                })

# Convert results to DataFrames
results_df = pd.DataFrame(results, columns=results_columns)
future_forecasts_df = pd.DataFrame(
    future_forecasts, columns=future_forecasts_columns
)

# Save results
results_file = os.path.join(output_dir, "forecast_results.csv")
future_forecasts_file = os.path.join(output_dir, "future_forecasts.csv")
results_df.to_csv(results_file, index=False)
future_forecasts_df.to_csv(future_forecasts_file, index=False)

print(f"Forecast results saved to {results_file}")
print(f"Future forecasts saved to {future_forecasts_file}")

The output contains negative accuracy values:

model   dataset horizon mape    accuracy
XGBoost synthetic_dataset_1.csv 3   0.9345570675519763  -96.02492281747726
XGBoost synthetic_dataset_1.csv 6   0.6271307289552767  -70.91875294532318
XGBoost synthetic_dataset_1.csv 9   0.9917830267047274  -98.13387817239064
XGBoost synthetic_dataset_1.csv 12  1.8660074870488188  -78.53834752084283
LightGBM    synthetic_dataset_1.csv 3   1.8211836390068303  -13.443149016458591
LightGBM    synthetic_dataset_1.csv 6   0.6815504925684449  -155.64855098317562
LightGBM    synthetic_dataset_1.csv 9   0.10735314290676172 -114.71200513051166
LightGBM    synthetic_dataset_1.csv 12  1.2607188837495213  -79.2995414610846
Decision Tree   synthetic_dataset_1.csv 3   1.086431157027433   -37.56293565488065
Decision Tree   synthetic_dataset_1.csv 6   0.24353449685825834 -195.8726668013846
Decision Tree   synthetic_dataset_1.csv 9   1.0238986600195292  -159.2508264827403
Decision Tree   synthetic_dataset_1.csv 12  1.8688982557030902  -16.728460706610804
Random Forest   synthetic_dataset_1.csv 3   0.9468312070925308  -171.5793613682497
Random Forest   synthetic_dataset_1.csv 6   1.2874845795631709  -46.06559850787815
Random Forest   synthetic_dataset_1.csv 9   0.8402783930768509  -63.063988951879764
Random Forest   synthetic_dataset_1.csv 12  0.6203818639940742  -49.10659016208899

Why does this happen, and how can I fix it?

本文标签: python · Why do I get negative accuracy rate for time forecasting and how to solve it · Stack Overflow