admin管理员组文章数量:1122832
i am pulling from a db orders for 2 customers going back to 2020
would like to predict their volume for the next 3 months
i have the following code, it sort of works its predicting the combined volume but its not separating results by each customer id - and i am hoping someone can help guide me in the right direction
import sqlalchemy
import pandas as pd
import numpy as n
################### FUNCTIONS #####################
def getSQLData(sql):
'''
sql_string = sql
DRIVER_NAME = 'SQL SERVER'
SERVER_NAME = 'server'
DATABASE_NAME = database
connection_string = f"""
DRIVER={{{DRIVER_NAME}}};
SERVER={SERVER_NAME};
DATABASE={DATABASE_NAME};
Trust_Connection=yes;"""
'''
conn = sqlalchemy.create_engine('mssql://server/Master?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server')
df = pd.read_sql(sql,conn)
return df
def getTDSQLData(sql):
conn = sqlalchemy.create_engine('teradatasql://server/?user=username&password=password')
df = pd.read_sql(sql,conn)
return df
################## DATA FILES #####################
# total revenue and volume
customer_volume_sql = '''
SELECT
"SALES_ORDER_DT",
"CUSTOMER_ID",
sum("VOLUME") as "VOLUME",
sum("net_revenue_amt") as "REVENUE"
FROM "db"."tbl"
where CUSTOMER_ID in ('12345', '67898') and
"SALES_ORDER_DT" between '2020-01-01' and '2024-11-19'
group by SALES_ORDER_DT,
CUSTOMER_ID;
'''
customer_volume_data = getTDSQLData(customer_volume_sql)
# holiday list
holidays_sql = '''
SELECT [Date]
FROM [db].[dbo].[tbl]
'''
holidays_data = getSQLData(holidays_sql)
# list of weekend days
weekends_sql = '''
SELECT "CALENDAR_DATE",
"WEEKEND_FLG",
"SEASON_DESC_EN",
"NATIONAL_HOLIDAY_IND"
FROM "db"."tbl"
'''
weekends_data = getTDSQLData(weekends_sql)
# list of dates to predict
prediction_dates = pd.read_excel('C:/Users/user/Desktop/prediction_empty2.xlsx', parse_dates = True)
################## CLEANUP / PREP #####################
# convert dates
customer_volume_data['Date'] = pd.to_datetime(customer_volume_data['SALES_ORDER_DT']).dt.date
holidays_data['Date'] = pd.to_datetime(holidays_data['Date']).dt.date
weekends_data['Date'] = pd.to_datetime(weekends_data['CALENDAR_DATE']).dt.date
# remove holidays
customer_volume_data = customer_volume_data.merge(holidays_data.drop_duplicates(), on=['Date'],
how='left', indicator=True)
customer_volume_data=customer_volume_data[customer_volume_data['_merge']=="left_only"]
customer_volume_data = customer_volume_data.drop('_merge', axis=1)
customer_volume_data
# summarize data
customer_volume_data=customer_volume_data.groupby(['Date', 'CUSTOMER_ID'])['VOLUME'].sum().reset_index()
# create features
customer_volume_data = customer_volume_data.merge(weekends_data.drop_duplicates(), on=['Date'],
how='left', indicator=True)
customer_volume_data['Year'] = pd.to_datetime(customer_volume_data['Date']).dt.year
customer_volume_data['Week'] = pd.to_datetime(customer_volume_data['Date']).dt.isocalendar().week
customer_volume_data['Day'] = pd.to_datetime(customer_volume_data['Date']).dt.day
customer_volume_data['WeekDay'] = pd.to_datetime(customer_volume_data['Date']).dt.dayofweek
prediction_dates['Year'] = pd.to_datetime(prediction_dates['Date']).dt.year
prediction_dates['Week'] = pd.to_datetime(prediction_dates['Date']).dt.isocalendar().week
prediction_dates['Day'] = pd.to_datetime(prediction_dates['Date']).dt.day
prediction_dates['WeekDay'] = pd.to_datetime(prediction_dates['Date']).dt.dayofweek
customer_volume_data = customer_volume_data.drop(['_merge', 'SEASON_DESC_EN', 'WEEKEND_FLG', 'CALENDAR_DATE', 'NATIONAL_HOLIDAY_IND'], axis=1)
################## PREDICT #####################
#import train_test_split
from sklearn.model_selection import train_test_split
# select algorithm
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)
# customers to loop through
unique_accounts=customer_volume_data.groupby(['CUSTOMER_ID'])['VOLUME'].sum().reset_index()
unique_accounts = unique_accounts.drop(['VOLUME'], axis=1)
test1 = prediction_dates.drop(['Date', 'Prediction'], axis=1)
for index, row in unique_accounts.iterrows():
# split data into train, test
predictors = customer_volume_data.drop(['Date', 'VOLUME'], axis=1)
target = customer_volume_data['VOLUME']
x_train, x_cv, y_train, y_cv=train_test_split(predictors, target, test_size=0.3, random_state=100, stratify=customer_volume_data[['CUSTOMER_ID']])
# fit, predict and store results
model.fit(x_train, y_train)
pred=model.predict(x_cv)
pred2=model.predict(test1)
prediction_dates['Prediction']=pred2.round(0)
result = prediction_dates[['Date', 'Prediction', 'CUSTOMER_ID']]
print(result)
this gives me the following output - the problem is that the customer_id is not populated and for both occurances it predicted the same numbers of the same days, i am expecting to see different values for each customer_id
any help is appritiated
本文标签: pythonstuck on traintestsplit and a for loopStack Overflow
版权声明:本文标题:python - stuck on train_test_split and a for loop - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1736310384a1934357.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论