admin管理员组

文章数量:1122832

i am pulling from a db orders for 2 customers going back to 2020

would like to predict their volume for the next 3 months

i have the following code, it sort of works its predicting the combined volume but its not separating results by each customer id - and i am hoping someone can help guide me in the right direction

import sqlalchemy
import pandas as pd
import numpy as n

################### FUNCTIONS #####################

def getSQLData(sql):

    '''
        sql_string = sql
        DRIVER_NAME = 'SQL SERVER'
        SERVER_NAME = 'server'
        DATABASE_NAME = database
        connection_string = f"""
            DRIVER={{{DRIVER_NAME}}};
            SERVER={SERVER_NAME};
            DATABASE={DATABASE_NAME};
            Trust_Connection=yes;"""
    '''
    conn = sqlalchemy.create_engine('mssql://server/Master?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server')

    df = pd.read_sql(sql,conn)

    return df

def getTDSQLData(sql):
 
    conn = sqlalchemy.create_engine('teradatasql://server/?user=username&password=password')

    df = pd.read_sql(sql,conn)

    return df

        
################## DATA FILES #####################

# total revenue and volume
customer_volume_sql = '''
      SELECT 
            "SALES_ORDER_DT",
            "CUSTOMER_ID",
            sum("VOLUME") as "VOLUME",
            sum("net_revenue_amt") as "REVENUE"
        FROM "db"."tbl"
        where CUSTOMER_ID in ('12345', '67898') and
             "SALES_ORDER_DT" between '2020-01-01' and '2024-11-19'
        group by SALES_ORDER_DT, 
            CUSTOMER_ID;

    '''
customer_volume_data = getTDSQLData(customer_volume_sql)

# holiday list
holidays_sql = '''
        SELECT [Date]
        FROM [db].[dbo].[tbl]
    '''
holidays_data = getSQLData(holidays_sql)


# list of weekend days
weekends_sql = '''
         SELECT "CALENDAR_DATE",
            "WEEKEND_FLG",
            "SEASON_DESC_EN",
            "NATIONAL_HOLIDAY_IND"
        FROM "db"."tbl"
    '''
weekends_data = getTDSQLData(weekends_sql)

# list of dates to predict
prediction_dates = pd.read_excel('C:/Users/user/Desktop/prediction_empty2.xlsx', parse_dates = True)


################## CLEANUP / PREP #####################


# convert dates
customer_volume_data['Date'] = pd.to_datetime(customer_volume_data['SALES_ORDER_DT']).dt.date
holidays_data['Date'] = pd.to_datetime(holidays_data['Date']).dt.date
weekends_data['Date'] = pd.to_datetime(weekends_data['CALENDAR_DATE']).dt.date


# remove holidays
customer_volume_data = customer_volume_data.merge(holidays_data.drop_duplicates(), on=['Date'], 
                   how='left', indicator=True)
customer_volume_data=customer_volume_data[customer_volume_data['_merge']=="left_only"]
customer_volume_data = customer_volume_data.drop('_merge', axis=1)
customer_volume_data


# summarize data
customer_volume_data=customer_volume_data.groupby(['Date', 'CUSTOMER_ID'])['VOLUME'].sum().reset_index()


# create features
customer_volume_data = customer_volume_data.merge(weekends_data.drop_duplicates(), on=['Date'], 
                   how='left', indicator=True)

customer_volume_data['Year'] = pd.to_datetime(customer_volume_data['Date']).dt.year
customer_volume_data['Week'] = pd.to_datetime(customer_volume_data['Date']).dt.isocalendar().week
customer_volume_data['Day'] = pd.to_datetime(customer_volume_data['Date']).dt.day
customer_volume_data['WeekDay'] = pd.to_datetime(customer_volume_data['Date']).dt.dayofweek

prediction_dates['Year'] = pd.to_datetime(prediction_dates['Date']).dt.year
prediction_dates['Week'] = pd.to_datetime(prediction_dates['Date']).dt.isocalendar().week
prediction_dates['Day'] = pd.to_datetime(prediction_dates['Date']).dt.day
prediction_dates['WeekDay'] = pd.to_datetime(prediction_dates['Date']).dt.dayofweek

customer_volume_data = customer_volume_data.drop(['_merge', 'SEASON_DESC_EN', 'WEEKEND_FLG', 'CALENDAR_DATE', 'NATIONAL_HOLIDAY_IND'], axis=1)

################## PREDICT #####################

#import train_test_split
from sklearn.model_selection import train_test_split

# select algorithm
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)   

# customers to loop through
unique_accounts=customer_volume_data.groupby(['CUSTOMER_ID'])['VOLUME'].sum().reset_index()
unique_accounts = unique_accounts.drop(['VOLUME'], axis=1)

test1 = prediction_dates.drop(['Date', 'Prediction'], axis=1)

for index, row in unique_accounts.iterrows():

    # split data into train, test
    predictors = customer_volume_data.drop(['Date', 'VOLUME'], axis=1)
    target = customer_volume_data['VOLUME']
    x_train, x_cv, y_train, y_cv=train_test_split(predictors, target, test_size=0.3, random_state=100, stratify=customer_volume_data[['CUSTOMER_ID']])

    # fit, predict and store results
    model.fit(x_train, y_train)
    pred=model.predict(x_cv)
   
    pred2=model.predict(test1)
    prediction_dates['Prediction']=pred2.round(0)

    result = prediction_dates[['Date', 'Prediction', 'CUSTOMER_ID']]
    print(result)

this gives me the following output - the problem is that the customer_id is not populated and for both occurances it predicted the same numbers of the same days, i am expecting to see different values for each customer_id

any help is appritiated

本文标签: pythonstuck on traintestsplit and a for loopStack Overflow