I am trying to run time series forecasting on around 8 datasets.
They look like this:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
# Sample rows for one monthly dataset. Each tuple follows the column order of
# `schema` below: date, total_invoiced_volume, monthly_avg_tmax, Week_Weight.
# Values destined for DoubleType columns are written as Python floats (1.0,
# 24.0) because Spark's schema verification does not accept a Python int for
# a DoubleType field.
data = [
    ("2022-07-01", 11767424, 25.774193548387096, 1.0),
    ("2022-08-01", 13331928, 25.677419354838708, 1.0),
    ("2022-09-01", 11194711, 19.633333333333333, 1.0),
    ("2022-10-01", 11506759, 17.967741935483872, 1.0),
    ("2022-11-01", 9525865, 12.933333333333334, 1.0),
    ("2022-12-01", 8438520, 7.96774193548387, 1.0),
    ("2023-01-01", 8811170, 8.806451612903226, 1.0),
    ("2023-02-01", 11417707, 10.464285714285714, 1.0),
    ("2023-03-01", 12539421, 10.935483870967742, 1.0),
    ("2023-04-01", 10824295, 13.5, 1.0),
    ("2023-05-01", 11651067, 17.538709677419355, 1.1),
    ("2023-06-01", 12870035, 24.043333333333333, 1.0),
    ("2023-07-01", 10717095, 22.032258064516128, 1.0),
    ("2023-08-01", 8262566, 22.483870967741936, 1.0),
    ("2023-09-01", 7564720, 23.53333333333333, 1.0),
    ("2023-10-01", 8540128, 17.580645161290324, 1.0),
    ("2023-11-01", 9278635, 11.366666666666667, 1.0),
    ("2023-12-01", 9293826, 10.580645161290322, 1.0),
    ("2024-01-01", 9628144, 8.483870967741936, 1.1),
    ("2024-02-01", 10127209, 11.96551724137931, 1.0),
    ("2024-03-01", 11405640, 12.419354838709678, 1.0),
    ("2024-04-01", 16075499, 14.26666666666667, 1.1),
    ("2024-05-01", 14236947, 18.64516129032258, 1.0),
    ("2024-06-01", 13946271, 20.7, 1.0),
    ("2024-07-01", 12968261, 22.548387096774192, 1.0),
    ("2024-08-01", 11624620, 24.0, 1.0),
    ("2024-09-01", 9705773, 19.3, 1.0),
    ("2024-10-01", 11821238, 16.096774193548388, 1.0),
    ("2024-11-01", 11457334, 11.166666666666666, 1.0),
    ("2024-12-01", 10537240, 9.66451612903226, 1.0),
]

# Explicit schema; the date is kept as a string here and every column is
# declared nullable.
schema = StructType([
    StructField("date", StringType(), True),
    StructField("total_invoiced_volume", IntegerType(), True),
    StructField("monthly_avg_tmax", DoubleType(), True),
    StructField("Week_Weight", DoubleType(), True),
])

# `spark` is not defined in this snippet.
# NOTE(review): presumably the SparkSession provided by the notebook
# environment -- confirm before running elsewhere.
gro_monthly_ev = spark.createDataFrame(data, schema)
All these datasets are stored in a folder, and I created a series of forecasting algorithms to predict the sales 3, 6, 9, and 12 months ahead.
This is the code that takes all the datasets and applies the forecasting algorithms:
import pandas as pd
import os
import glob
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from tqdm import tqdm
import numpy as np
# Define directories
input_dir = "/Workspace/Users/gab/path_to_merged_data"
output_dir = "/Workspace/Users/gab/forecast_results"
os.makedirs(output_dir, exist_ok=True)

# List all monthly datasets.
# Note: the "monthly" filter is matched against the full path returned by
# glob (directory part included), not just the file name.
file_list = [file for file in glob.glob(os.path.join(input_dir, "*.csv")) if "monthly" in file]

# Initialize results storage
results = []           # one row per (model, dataset, horizon) with its metrics
future_forecasts = []  # one row per forecasted month

# Forecasting horizons (number of months ahead)
forecast_horizons = [3, 6, 9, 12]

# Define models
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42),
    "Random Forest": RandomForestRegressor(max_depth=4, random_state=42),
    "XGBoost": XGBRegressor(max_depth=4, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
}

# Process each dataset
for file in file_list:
    # Load dataset
    dataset_name = os.path.basename(file)
    df = pd.read_csv(file)

    # Ensure no missing values.
    # NOTE(review): skipping the dataset is a conservative choice; replace
    # with imputation or a hard failure if that suits the workflow better.
    if df.isnull().sum().sum() > 0:
        print(f"Skipping {dataset_name}: dataset contains missing values")
        continue

    # Ensure date column is in datetime format
    df["date"] = pd.to_datetime(df["date"])

    # Process variables
    target_col = "total_invoiced_volume"
    # Convert decimals to integers for time series models
    df["monthly_avg_tmax"] = df["monthly_avg_tmax"].round(0).astype(int)

    # Prepare train and test sets: everything from (last date - 3 months)
    # onwards is held out, the rest is used for training.
    df.sort_values(by="date", inplace=True)
    split_date = df["date"].max() - pd.DateOffset(months=3)
    test = df[df["date"] >= split_date]
    train = df[df["date"] < split_date]

    # A dataset covering three months or less leaves nothing to train on;
    # fitting on an empty frame would raise inside the estimators.
    if train.empty:
        print(f"Skipping {dataset_name}: not enough history to build a training set")
        continue

    # Features and target: every column except the date and the target.
    features = [column for column in df.columns if column not in ("date", target_col)]
    X_train, y_train = train[features], train[target_col]
    X_test, y_test = test[features], test[target_col]

    # Train and forecast with each model
    for model_name, model in tqdm(models.items(), desc=f"Processing {dataset_name}"):
        # Train the model
        model.fit(X_train, y_train)

        # Predict on test data
        y_pred = model.predict(X_test)

        # Calculate metrics.
        # The value stored under "accuracy" is R^2, which is NOT bounded
        # below by 0: it is negative whenever the model's squared error on
        # y_test is larger than that of always predicting mean(y_test). With
        # only a handful of hold-out rows this happens easily, so a negative
        # value signals a poor fit rather than a computation error.
        mape = mean_absolute_percentage_error(y_test, y_pred)
        accuracy = r2_score(y_test, y_pred)

        for horizon in forecast_horizons:
            # First-of-month dates following the last observed date.
            future_dates = pd.date_range(start=df["date"].max(), periods=horizon + 1, freq="MS")[1:]
            # The hold-out features are reused as stand-ins for the future
            # months, so at most len(X_test) forecasts exist per horizon.
            future_X = X_test.iloc[:min(horizon, len(X_test)), :]
            future_y = model.predict(future_X)

            # Store results. mape/accuracy are computed once per model on the
            # hold-out set, so the same values are recorded for every horizon.
            results.append({
                "model": model_name,
                "dataset": dataset_name,
                "horizon": horizon,
                "mape": mape,
                "accuracy": accuracy,
            })

            # Store future forecast; zip stops at the shorter of the two
            # sequences, i.e. at the number of predictions actually made.
            for forecast_date, forecast_value in zip(future_dates, future_y):
                future_forecasts.append({
                    "model": model_name,
                    "forecast_date": forecast_date,
                    "forecast": forecast_value,
                })

# Convert results to DataFrames
results_df = pd.DataFrame(results)
future_forecasts_df = pd.DataFrame(future_forecasts)

# Save results
results_file = os.path.join(output_dir, "forecast_results.csv")
future_forecasts_file = os.path.join(output_dir, "future_forecasts.csv")
results_df.to_csv(results_file, index=False)
future_forecasts_df.to_csv(future_forecasts_file, index=False)
print(f"Forecast results saved to {results_file}")
print(f"Future forecasts saved to {future_forecasts_file}")
The outputs come with negative accuracy values:
model dataset horizon mape accuracy
XGBoost synthetic_dataset_1.csv 3 0.9345570675519763 -96.02492281747726
XGBoost synthetic_dataset_1.csv 6 0.6271307289552767 -70.91875294532318
XGBoost synthetic_dataset_1.csv 9 0.9917830267047274 -98.13387817239064
XGBoost synthetic_dataset_1.csv 12 1.8660074870488188 -78.53834752084283
LightGBM synthetic_dataset_1.csv 3 1.8211836390068303 -13.443149016458591
LightGBM synthetic_dataset_1.csv 6 0.6815504925684449 -155.64855098317562
LightGBM synthetic_dataset_1.csv 9 0.10735314290676172 -114.71200513051166
LightGBM synthetic_dataset_1.csv 12 1.2607188837495213 -79.2995414610846
Decision Tree synthetic_dataset_1.csv 3 1.086431157027433 -37.56293565488065
Decision Tree synthetic_dataset_1.csv 6 0.24353449685825834 -195.8726668013846
Decision Tree synthetic_dataset_1.csv 9 1.0238986600195292 -159.2508264827403
Decision Tree synthetic_dataset_1.csv 12 1.8688982557030902 -16.728460706610804
Random Forest synthetic_dataset_1.csv 3 0.9468312070925308 -171.5793613682497
Random Forest synthetic_dataset_1.csv 6 1.2874845795631709 -46.06559850787815
Random Forest synthetic_dataset_1.csv 9 0.8402783930768509 -63.063988951879764
Random Forest synthetic_dataset_1.csv 12 0.6203818639940742 -49.10659016208899
How can I understand why this happens, and how do I solve it?
本文标签: pythonWhy do I get negative accuracy rate for time forecasting and how to solve itStack Overflow
版权声明:本文标题:python - Why do I get negative accuracy rate for time forecasting and how to solve it? - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。