admin管理员组文章数量:1188851
I am trying to run time series forecasting on around 8 datasets.
They look like this:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
# Sample rows of one monthly dataset, in schema order:
# (date, total_invoiced_volume, monthly_avg_tmax, Week_Weight).
data = [
    ("2022-07-01", 11767424, 25.774193548387096, 1),
    ("2022-08-01", 13331928, 25.677419354838708, 1),
    ("2022-09-01", 11194711, 19.633333333333333, 1),
    ("2022-10-01", 11506759, 17.967741935483872, 1),
    ("2022-11-01", 9525865, 12.933333333333334, 1),
    ("2022-12-01", 8438520, 7.96774193548387, 1),
    ("2023-01-01", 8811170, 8.806451612903226, 1),
    ("2023-02-01", 11417707, 10.464285714285714, 1),
    ("2023-03-01", 12539421, 10.935483870967742, 1),
    ("2023-04-01", 10824295, 13.5, 1),
    ("2023-05-01", 11651067, 17.538709677419355, 1.1),
    ("2023-06-01", 12870035, 24.043333333333333, 1),
    ("2023-07-01", 10717095, 22.032258064516128, 1),
    ("2023-08-01", 8262566, 22.483870967741936, 1),
    ("2023-09-01", 7564720, 23.53333333333333, 1),
    ("2023-10-01", 8540128, 17.580645161290324, 1),
    ("2023-11-01", 9278635, 11.366666666666667, 1),
    ("2023-12-01", 9293826, 10.580645161290322, 1),
    ("2024-01-01", 9628144, 8.483870967741936, 1.1),
    ("2024-02-01", 10127209, 11.96551724137931, 1),
    ("2024-03-01", 11405640, 12.419354838709678, 1),
    ("2024-04-01", 16075499, 14.26666666666667, 1.1),
    ("2024-05-01", 14236947, 18.64516129032258, 1),
    ("2024-06-01", 13946271, 20.7, 1),
    ("2024-07-01", 12968261, 22.548387096774192, 1),
    ("2024-08-01", 11624620, 24, 1),
    ("2024-09-01", 9705773, 19.3, 1),
    ("2024-10-01", 11821238, 16.096774193548388, 1),
    ("2024-11-01", 11457334, 11.166666666666666, 1),
    ("2024-12-01", 10537240, 9.66451612903226, 1),
]

schema = StructType([
    StructField("date", StringType(), True),
    StructField("total_invoiced_volume", IntegerType(), True),
    StructField("monthly_avg_tmax", DoubleType(), True),
    StructField("Week_Weight", DoubleType(), True),
])

# Several rows carry Python ints (e.g. 1, 24) in columns declared as
# DoubleType. PySpark's schema verification does not coerce int to double,
# so every value is converted to the declared column type up front.
typed_rows = [
    (date, int(volume), float(tmax), float(weight))
    for date, volume, tmax, weight in data
]

gro_monthly_ev = spark.createDataFrame(typed_rows, schema)
display(gro_monthly_ev)
All these datasets are stored in a folder, and I created a series of forecasting models to predict the sales 3, 6, 9, and 12 months ahead.
This is the code that loads all the datasets and applies the forecasting algorithms:
%python
import pandas as pd
import os
import glob
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from tqdm import tqdm
import numpy as np
# Forecasting pipeline: for every monthly CSV in input_dir, fit each model on
# the older rows, score it on the most recent rows, and write the metrics and
# the per-horizon predictions to two CSV files in output_dir.

# Define directories
input_dir = "/Workspace/Users/gab/path_to_merged_data"
output_dir = "/Workspace/Users/gab/forecast_results"
os.makedirs(output_dir, exist_ok=True)

# List all monthly datasets. The pattern is applied to the file name only, so
# a directory whose name happens to contain "monthly" cannot pull in unrelated
# CSVs; sorting makes the processing (and output) order reproducible.
file_list = sorted(glob.glob(os.path.join(input_dir, "*monthly*.csv")))

# Initialize results storage
results = []
future_forecasts = []

# Forecasting horizons (in months)
forecast_horizons = [3, 6, 9, 12]

# Define models
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42),
    "Random Forest": RandomForestRegressor(max_depth=4, random_state=42),
    "XGBoost": XGBRegressor(max_depth=4, random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
}

target_col = "total_invoiced_volume"

# Process each dataset
for file in file_list:
    # Load dataset
    dataset_name = os.path.basename(file)
    df = pd.read_csv(file)

    # Datasets with missing values are not modelled; say so instead of
    # dropping them silently, otherwise they just vanish from the output.
    if df.isnull().sum().sum() > 0:
        print(f"Skipping {dataset_name}: contains missing values")
        continue

    # Ensure date column is in datetime format
    df["date"] = pd.to_datetime(df["date"])

    # Round the temperature feature to whole degrees.
    # NOTE(review): tree-based models accept floats directly, so this rounding
    # only discards information -- confirm it is intended.
    df["monthly_avg_tmax"] = df["monthly_avg_tmax"].round(0).astype(int)

    # Chronological train/test split. The boundary is inclusive on the test
    # side, so with one row per month the test set holds the last four rows.
    df.sort_values(by="date", inplace=True)
    last_date = df["date"].max()
    cutoff = last_date - pd.DateOffset(months=3)
    test = df[df["date"] >= cutoff]
    train = df[df["date"] < cutoff]

    # A dataset covering no more than the test window leaves nothing to fit on.
    if train.empty:
        print(f"Skipping {dataset_name}: no rows before {cutoff.date()} to train on")
        continue

    # Features and target
    features = [name for name in df.columns if name not in ("date", target_col)]
    X_train, y_train = train[features], train[target_col]
    X_test, y_test = test[features], test[target_col]

    # Train and forecast with each model
    for model_name, model in tqdm(models.items(), desc=f"Processing {dataset_name}"):
        # Train the model (fit() discards any state from a previous dataset)
        model.fit(X_train, y_train)

        # Predict on test data
        y_pred = model.predict(X_test)

        # Calculate metrics.
        # "accuracy" holds the R^2 score, not a 0-1 accuracy: it is negative
        # whenever the model predicts the test rows worse than a constant
        # equal to their mean, which is easy to hit with so few test rows.
        mape = mean_absolute_percentage_error(y_test, y_pred)
        accuracy = r2_score(y_test, y_pred)

        # Store results.
        # NOTE(review): the metrics above come from the single test window, so
        # they are identical for every horizon of a given model and dataset.
        for horizon in forecast_horizons:
            future_dates = pd.date_range(
                start=last_date, periods=horizon + 1, freq="MS"
            )[1:]

            # Future feature values are unknown, so the test-window rows are
            # reused as stand-ins; the slice caps the forecast at
            # len(X_test) rows even for longer horizons.
            future_X = X_test.iloc[:min(horizon, len(X_test)), :]
            future_y = model.predict(future_X)

            results.append({
                "model": model_name,
                "dataset": dataset_name,
                "horizon": horizon,
                "mape": mape,
                "accuracy": accuracy,
            })

            # Store future forecast. Dataset and horizon are recorded so rows
            # from different files and horizons stay distinguishable in the
            # combined CSV. zip() stops at the shorter of the two sequences.
            for forecast_date, forecast in zip(future_dates, future_y):
                future_forecasts.append({
                    "model": model_name,
                    "dataset": dataset_name,
                    "horizon": horizon,
                    "forecast_date": forecast_date,
                    "forecast": forecast,
                })

# Convert results to DataFrames
results_df = pd.DataFrame(results)
future_forecasts_df = pd.DataFrame(future_forecasts)

# Save results
results_file = os.path.join(output_dir, "forecast_results.csv")
future_forecasts_file = os.path.join(output_dir, "future_forecasts.csv")
results_df.to_csv(results_file, index=False)
future_forecasts_df.to_csv(future_forecasts_file, index=False)
print(f"Forecast results saved to {results_file}")
print(f"Future forecasts saved to {future_forecasts_file}")
The outputs contain negative accuracy values:
model dataset horizon mape accuracy
XGBoost synthetic_dataset_1.csv 3 0.9345570675519763 -96.02492281747726
XGBoost synthetic_dataset_1.csv 6 0.6271307289552767 -70.91875294532318
XGBoost synthetic_dataset_1.csv 9 0.9917830267047274 -98.13387817239064
XGBoost synthetic_dataset_1.csv 12 1.8660074870488188 -78.53834752084283
LightGBM synthetic_dataset_1.csv 3 1.8211836390068303 -13.443149016458591
LightGBM synthetic_dataset_1.csv 6 0.6815504925684449 -155.64855098317562
LightGBM synthetic_dataset_1.csv 9 0.10735314290676172 -114.71200513051166
LightGBM synthetic_dataset_1.csv 12 1.2607188837495213 -79.2995414610846
Decision Tree synthetic_dataset_1.csv 3 1.086431157027433 -37.56293565488065
Decision Tree synthetic_dataset_1.csv 6 0.24353449685825834 -195.8726668013846
Decision Tree synthetic_dataset_1.csv 9 1.0238986600195292 -159.2508264827403
Decision Tree synthetic_dataset_1.csv 12 1.8688982557030902 -16.728460706610804
Random Forest synthetic_dataset_1.csv 3 0.9468312070925308 -171.5793613682497
Random Forest synthetic_dataset_1.csv 6 1.2874845795631709 -46.06559850787815
Random Forest synthetic_dataset_1.csv 9 0.8402783930768509 -63.063988951879764
Random Forest synthetic_dataset_1.csv 12 0.6203818639940742 -49.10659016208899
Why does this happen, and how can I solve it?
本文标签: pythonWhy do I get negative accuracy rate for time forecasting and how to solve itStack Overflow
版权声明:本文标题:python - Why do I get negative accuracy rate for time forecasting and how to solve it? - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1738398545a2084644.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论