amazon sagemaker - Training script unable to load preprocessing model - Stack Overflow-软件玩家

admin管理员组
文章数量:1325233

I am new to SageMaker. I am trying to create an inference pipeline, and for that I am creating two models: one for preprocessing and another for training. I am using SKLearn to create both of those jobs. For the preprocessor, I have a preprocessing script that includes the basic data transformations and the functions required for inference. At the end of this job, the preprocessor artifact (preprocessor.joblib) is saved to S3. Up to this point the code runs without issues, and the artifact is saved to /opt/ml/model/. The next step is to train a model, and for that I have a training script that loads the preprocessor. Inside the main block, when the preprocessor is loaded, I get an error saying that the artifact does not exist.

Here are both the preprocessing and training scripts:

---- preprocessing script


    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    import pandas as pd
    import joblib
    import os
    
    # Function to create a preprocessing pipeline
    def preprocess_data(numeric_features=None):
        """Build the unfitted preprocessing transformer for the numeric columns.

        Every selected column is median-imputed and then standardized.

        Args:
            numeric_features: Names of the columns to impute and scale. When
                omitted, ``['age', 'inactivity', 'recency', 'frequency']`` is
                used, which matches the columns the scripts select.

        Returns:
            An unfitted ``ColumnTransformer`` with a single ``'num'`` branch.
        """
        if numeric_features is None:
            numeric_features = ['age', 'inactivity', 'recency', 'frequency']

        # Impute first so the scaler never sees missing values.
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])

        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
            ]
        )
    
    if __name__ == "__main__":
        # Entry point of the preprocessing job: fit the preprocessing transformer
        # on every file of the "train" channel and write it to the model directory.
        print("[INFO] Loading and preprocessing training data...")

        # Paths for SageMaker environment variables
        input_dir = os.environ.get("SM_CHANNEL_TRAIN")
        output_dir = os.environ.get("SM_MODEL_DIR")
        # Fail early: os.listdir(None) would silently list the current working
        # directory, and os.path.join(None, ...) would raise an opaque TypeError.
        if not input_dir or not output_dir:
            raise RuntimeError(
                "SM_CHANNEL_TRAIN and SM_MODEL_DIR must both be set."
            )

        # Combine input files; keep regular files only and sort them so the
        # concatenation order does not depend on directory listing order.
        data_files = sorted(
            path
            for path in (os.path.join(input_dir, name) for name in os.listdir(input_dir))
            if os.path.isfile(path)
        )
        if not data_files:
            raise FileNotFoundError(f"No training files found in {input_dir}")
        train_data = pd.concat(
            [pd.read_csv(path) for path in data_files], ignore_index=True
        )

        # Keep only the feature columns; the target column is not needed to fit
        # the preprocessor.
        predefined_features = ['age', 'inactivity', 'recency', 'frequency']
        features_data = train_data[predefined_features]

        # Fit the preprocessing pipeline
        preprocessor = preprocess_data()
        preprocessor.fit(features_data)

        # Save the preprocessor model
        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(preprocessor, os.path.join(output_dir, "preprocessor.joblib"))
        print("[INFO] Preprocessing model saved successfully as preprocessor.joblib.")

--- Training script


    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report
    import pandas as pd
    import joblib
    import os
    import argparse
    
    # Function to load a pre-trained model
    def load_preprocessor(model_dir):
        """Load the fitted preprocessor stored under ``model_dir``.

        Two layouts are accepted:

        * ``<model_dir>/preprocessor.joblib`` - the unpacked artifact;
        * ``<model_dir>/model.tar.gz`` containing ``preprocessor.joblib`` - the
          form in which a previous SageMaker job's output is delivered when its
          S3 location is passed to a later job as an input channel.

        Args:
            model_dir: Directory to search for the artifact.

        Returns:
            The object deserialized by ``joblib.load``.

        Raises:
            FileNotFoundError: If neither layout provides ``preprocessor.joblib``.
        """
        import shutil
        import tarfile
        import tempfile

        artifact_name = "preprocessor.joblib"
        preprocessor_path = os.path.join(model_dir, artifact_name)

        # NOTE: joblib.load unpickles its input, so only point this at artifacts
        # produced by your own jobs.
        if os.path.isfile(preprocessor_path):
            print(f"[INFO] Loading preprocessor from {preprocessor_path}")
            return joblib.load(preprocessor_path)

        archive_path = os.path.join(model_dir, "model.tar.gz")
        if os.path.isfile(archive_path):
            with tarfile.open(archive_path) as archive:
                # Match on the base name so "./preprocessor.joblib" is found too.
                member = next(
                    (
                        entry
                        for entry in archive.getmembers()
                        if entry.isfile()
                        and os.path.basename(entry.name) == artifact_name
                    ),
                    None,
                )
                if member is not None:
                    print(f"[INFO] Loading preprocessor from {archive_path}")
                    # Copy just this member to a scratch file instead of calling
                    # extractall(), so archive paths are never trusted.
                    with tempfile.TemporaryDirectory() as scratch_dir:
                        extracted_path = os.path.join(scratch_dir, artifact_name)
                        with archive.extractfile(member) as src, \
                                open(extracted_path, "wb") as dst:
                            shutil.copyfileobj(src, dst)
                        return joblib.load(extracted_path)

        raise FileNotFoundError(f"[ERROR] Preprocessor artifact not found at {preprocessor_path}")
    
    if __name__ == "__main__":
        # Entry point of the training job: load the fitted preprocessor, train a
        # random forest on the transformed features, save it and report metrics.
        print("[INFO] Parsing arguments...")
        parser = argparse.ArgumentParser()
        parser.add_argument("--n_estimators", type=int, default=100)
        parser.add_argument("--random_state", type=int, default=42)
        parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
        parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
        parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
        parser.add_argument("--train-file", type=str, default="train.csv")
        parser.add_argument("--test-file", type=str, default="test.csv")
        # The model directory is this job's *output* location and starts out
        # empty, so the preprocessor fitted by the earlier job cannot be found
        # there. Pass that job's artifact as an extra input channel named
        # "preprocessor" (exposed as SM_CHANNEL_PREPROCESSOR) or give the
        # directory explicitly with this flag.
        parser.add_argument(
            "--preprocessor-dir",
            type=str,
            default=os.environ.get("SM_CHANNEL_PREPROCESSOR"),
        )
        args = parser.parse_args()

        # Report missing directories clearly instead of failing later with a
        # TypeError from os.path.join(None, ...).
        for flag, value in (
            ("--model-dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        ):
            if not value:
                parser.error(
                    f"{flag} is required (or set the matching SM_* environment variable)"
                )

        # Fall back to the model directory so existing invocations behave as before.
        preprocessor_dir = args.preprocessor_dir or args.model_dir

        print("[INFO] Loading preprocessor...")
        preprocessor = load_preprocessor(preprocessor_dir)

        print("[INFO] Reading training and testing data...")
        train_df = pd.read_csv(os.path.join(args.train, args.train_file))
        test_df = pd.read_csv(os.path.join(args.test, args.test_file))

        # Define features and labels
        features = ['age', 'inactivity', 'recency', 'frequency']
        label = 'Risk'

        # Preprocess training and testing data with the already-fitted
        # preprocessor (transform only, never re-fit on the test set).
        X_train = preprocessor.transform(train_df[features])
        y_train = train_df[label]
        X_test = preprocessor.transform(test_df[features])
        y_test = test_df[label]

        print("[INFO] Training Random Forest model...")
        model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
        model.fit(X_train, y_train)

        # Save the trained model
        os.makedirs(args.model_dir, exist_ok=True)
        model_path = os.path.join(args.model_dir, "random_forest_model.joblib")
        joblib.dump(model, model_path)
        print(f"[INFO] Trained model saved as random_forest_model.joblib at {model_path}")

        print("[INFO] Evaluating model on test data...")
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print("Accuracy:", acc)
        print("Classification Report:\n", report)

What could be the issue?

本文标签： amazon sagemakerTraining script unable to load preprocessing modelStack Overflow

版权声明：本文标题：amazon sagemaker - Training script unable to load preprocessing model - Stack Overflow 内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://www.betaflare.com/web/1742080193a2419652.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

编程频道|软件玩家 - 软件改变生活！

amazon sagemaker - Training script unable to load preprocessing model - Stack Overflow

更多相关文章