admin管理员组

文章数量:1405375

I trained a TabPFN model and then tried to apply a sequential feature selector to it in order to pick out the most important features. I keep getting this error:

KeyError(f"None of [{key}] are in the [{axis_name}]")

on the following code:

Offending code:

from tabpfn_extensions import interpretability
from sklearn.feature_selection import SequentialFeatureSelector

try:
    sfs = interpretability.feature_selection.feature_selection(
        estimator=PFN,
        X=X_train,
        y=y_train,
        n_features_to_select=5,
        feature_names=category_column_indexes,
        error_score='raise'
    )
except KeyError as e:
    print("KeyError:", e)
    print("X_train.columns:", X_train.columns)
    print("category_column_indexes:", category_column_indexes)

Full error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 33
      4 # category_column_names = [X_train.columns[i] for i in category_column_indexes]
      5 
      6 # X_train.head()
   (...)
     27 #     X_train_selected.columns[i] for i in filtered_cat_indices
     28 # ]
     32 try:
---> 33     sfs = interpretability.feature_selection.feature_selection(
     34         estimator=PFN,
     35         X=X_train,
     36         y=y_train,
     37         n_features_to_select=5,
     38         feature_names=category_column_indexes,
     39         error_score='raise'
     40     )
     41 except KeyError as e:
     42     print("KeyError:", e)

File /tabpfn-extensions/src/tabpfn_extensions/interpretability/feature_selection.py:29, in feature_selection(estimator, X, y, n_features_to_select, feature_names, **kwargs)
     27         estimator.show_progress = show_progress_
     28 else:
---> 29     return _feature_selection(
     30         estimator, X, y, n_features_to_select, feature_names, **kwargs
     31     )

File /tabpfn-extensions/src/tabpfn_extensions/interpretability/feature_selection.py:50, in _feature_selection(estimator, X, y, n_features_to_select, feature_names, **kwargs)
     46 # TODO: Feature selection is done without CV, i.e. final CV scores might be biased (too good)
     47 sfs = SequentialFeatureSelector(
     48     estimator, n_features_to_select=n_features_to_select, direction="forward"
     49 )
---> 50 sfs.fit(X, y)
     51 sfs.get_support()
     52 X_transformed = sfs.transform(X)

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py:255, in SequentialFeatureSelector.fit(self, X, y)
    253 is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
    254 for _ in range(n_iterations):
--> 255     new_feature_idx, new_score = self._get_best_new_feature_score(
    256         cloned_estimator, X, y, cv, current_mask
    257     )
    258     if is_auto_select and ((new_score - old_score) < self.tol):
    259         break

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py:286, in SequentialFeatureSelector._get_best_new_feature_score(self, estimator, X, y, cv, current_mask)
    284         candidate_mask = ~candidate_mask
    285     X_new = X[:, candidate_mask]
--> 286     scores[feature_idx] = cross_val_score(
    287         estimator,
    288         X_new,
    289         y,
    290         cv=cv,
    291         scoring=self.scoring,
    292         n_jobs=self.n_jobs,
    293     ).mean()
    294 new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
    295 return new_feature_idx, scores[new_feature_idx]

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:712, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)
    709 # To ensure multimetric format is not supported
    710 scorer = check_scoring(estimator, scoring=scoring)
--> 712 cv_results = cross_validate(
    713     estimator=estimator,
    714     X=X,
    715     y=y,
    716     groups=groups,
    717     scoring={"score": scorer},
    718     cv=cv,
    719     n_jobs=n_jobs,
    720     verbose=verbose,
    721     fit_params=fit_params,
    722     params=params,
    723     pre_dispatch=pre_dispatch,
    724     error_score=error_score,
    725 )
    726 return cv_results["test_score"]

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:443, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
    422 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    423 results = parallel(
    424     delayed(_fit_and_score)(
    425         clone(estimator),
   (...)
    440     for train, test in indices
    441 )
--> 443 _warn_or_raise_about_fit_failures(results, error_score)
    445 # For callable scoring, the return type is only know after calling. If the
    446 # return type is a dictionary, the error scores can now be inserted with
    447 # the correct key.
    448 if callable(scoring):

File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:529, in _warn_or_raise_about_fit_failures(results, error_score)
    522 if num_failed_fits == num_fits:
    523     all_fits_failed_message = (
    524         f"\nAll the {num_fits} fits failed.\n"
    525         "It is very likely that your model is misconfigured.\n"
    526         "You can try to debug the error by setting error_score='raise'.\n\n"
    527         f"Below are more details about the failures:\n{fit_errors_summary}"
    528     )
--> 529     raise ValueError(all_fits_failed_message)
    531 else:
    532     some_fits_failed_message = (
    533         f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
    534         "The score on these train-test partitions for these parameters"
   (...)
    538         f"Below are more details about the failures:\n{fit_errors_summary}"
    539     )

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/TabPFN/src/tabpfn/classifier.py", line 453, in fit
    X = _fix_dtypes(X, cat_indices=self.categorical_features_indices)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/TabPFN/src/tabpfn/utils.py", line 480, in _fix_dtypes
    X[cat_indices] = X[cat_indices].astype("category")
                     ~^^^^^^^^^^^^^
  File "/python3.12/site-packages/pandas/core/frame.py", line 4108, in __getitem__
    indexer = self.columns._get_indexer_strict(key, "columns")[1]
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/python3.12/site-packages/pandas/core/indexes/base.py", line 6200, in _get_indexer_strict
    self._raise_if_missing(keyarr, indexer, axis_name)
  File "/python3.12/site-packages/pandas/core/indexes/base.py", line 6249, in _raise_if_missing
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index([ 1,  3,  5,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,\n       40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,\n       58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,\n       76, 77, 78, 79, 80, 81, 82, 83, 84, 85],\n      dtype='int64')] are in the [columns]"

For reference, my data frame consists of four numerical columns (indexes 0, 2, 4, 6); the remaining columns are categorical. Here is how I initialised the classifier:

PFN = TabPFNClassifier(
    random_state=42,
    categorical_features_indices=category_column_indexes,
    softmax_temperature=0.5,
    n_estimators=94,
    balance_probabilities=False,
    average_before_softmax=True,
    inference_config=ModelInterfaceConfig(
      OUTLIER_REMOVAL_STD=1000,
      REGRESSION_Y_PREPROCESS_TRANSFORMS=(None,),
      FINGERPRINT_FEATURE=False,
      PREPROCESS_TRANSFORMS=(PreprocessorConfig("none",),)
    )
)

I tried commenting out feature_names=category_column_indexes, I tried passing category_column_names instead, and I tried training the TabPFN model on the categorical columns only, but I got the same error each time.

I also inserted print statements into the classifier source code itself, and this is what I got:

self.categorical_features_indices: [1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85]
use_iloc: False
is_numeric_indices: True
columns_are_numeric: True
X.columns: RangeIndex(start=0, stop=86, step=1)
cat_indices: [1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85]
... (repeats)

本文标签: python - TabPFN feature selection raises KeyError(f"None of [{key}] are in the [{axis_name}]") - Stack Overflow