admin管理员组文章数量:1405375
I trained a TabPFN model and then tried to apply a sequential feature selector to it in order to identify the most important features. I keep getting this error:
KeyError(f"None of [{key}] are in the [{axis_name}]")
on the following code:
Offending code:
from tabpfn_extensions import interpretability
from sklearn.feature_selection import SequentialFeatureSelector
try:
sfs = interpretability.feature_selection.feature_selection(
estimator=PFN,
X=X_train,
y=y_train,
n_features_to_select=5,
feature_names=category_column_indexes,
error_score='raise'
)
except KeyError as e:
print("KeyError:", e)
print("X_train.columns:", X_train.columns)
print("category_column_indexes:", category_column_indexes)
Full error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[5], line 33
4 # category_column_names = [X_train.columns[i] for i in category_column_indexes]
5
6 # X_train.head()
(...)
27 # X_train_selected.columns[i] for i in filtered_cat_indices
28 # ]
32 try:
---> 33 sfs = interpretability.feature_selection.feature_selection(
34 estimator=PFN,
35 X=X_train,
36 y=y_train,
37 n_features_to_select=5,
38 feature_names=category_column_indexes,
39 error_score='raise'
40 )
41 except KeyError as e:
42 print("KeyError:", e)
File /tabpfn-extensions/src/tabpfn_extensions/interpretability/feature_selection.py:29, in feature_selection(estimator, X, y, n_features_to_select, feature_names, **kwargs)
27 estimator.show_progress = show_progress_
28 else:
---> 29 return _feature_selection(
30 estimator, X, y, n_features_to_select, feature_names, **kwargs
31 )
File /tabpfn-extensions/src/tabpfn_extensions/interpretability/feature_selection.py:50, in _feature_selection(estimator, X, y, n_features_to_select, feature_names, **kwargs)
46 # TODO: Feature selection is done without CV, i.e. final CV scores might be biased (too good)
47 sfs = SequentialFeatureSelector(
48 estimator, n_features_to_select=n_features_to_select, direction="forward"
49 )
---> 50 sfs.fit(X, y)
51 sfs.get_support()
52 X_transformed = sfs.transform(X)
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py:255, in SequentialFeatureSelector.fit(self, X, y)
253 is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
254 for _ in range(n_iterations):
--> 255 new_feature_idx, new_score = self._get_best_new_feature_score(
256 cloned_estimator, X, y, cv, current_mask
257 )
258 if is_auto_select and ((new_score - old_score) < self.tol):
259 break
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py:286, in SequentialFeatureSelector._get_best_new_feature_score(self, estimator, X, y, cv, current_mask)
284 candidate_mask = ~candidate_mask
285 X_new = X[:, candidate_mask]
--> 286 scores[feature_idx] = cross_val_score(
287 estimator,
288 X_new,
289 y,
290 cv=cv,
291 scoring=self.scoring,
292 n_jobs=self.n_jobs,
293 ).mean()
294 new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
295 return new_feature_idx, scores[new_feature_idx]
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:712, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)
709 # To ensure multimetric format is not supported
710 scorer = check_scoring(estimator, scoring=scoring)
--> 712 cv_results = cross_validate(
713 estimator=estimator,
714 X=X,
715 y=y,
716 groups=groups,
717 scoring={"score": scorer},
718 cv=cv,
719 n_jobs=n_jobs,
720 verbose=verbose,
721 fit_params=fit_params,
722 params=params,
723 pre_dispatch=pre_dispatch,
724 error_score=error_score,
725 )
726 return cv_results["test_score"]
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:443, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
422 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
423 results = parallel(
424 delayed(_fit_and_score)(
425 clone(estimator),
(...)
440 for train, test in indices
441 )
--> 443 _warn_or_raise_about_fit_failures(results, error_score)
445 # For callable scoring, the return type is only know after calling. If the
446 # return type is a dictionary, the error scores can now be inserted with
447 # the correct key.
448 if callable(scoring):
File ~/miniconda3/envs/term2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:529, in _warn_or_raise_about_fit_failures(results, error_score)
522 if num_failed_fits == num_fits:
523 all_fits_failed_message = (
524 f"\nAll the {num_fits} fits failed.\n"
525 "It is very likely that your model is misconfigured.\n"
526 "You can try to debug the error by setting error_score='raise'.\n\n"
527 f"Below are more details about the failures:\n{fit_errors_summary}"
528 )
--> 529 raise ValueError(all_fits_failed_message)
531 else:
532 some_fits_failed_message = (
533 f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
534 "The score on these train-test partitions for these parameters"
(...)
538 f"Below are more details about the failures:\n{fit_errors_summary}"
539 )
ValueError:
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
File "/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/TabPFN/src/tabpfn/classifier.py", line 453, in fit
X = _fix_dtypes(X, cat_indices=self.categorical_features_indices)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/TabPFN/src/tabpfn/utils.py", line 480, in _fix_dtypes
X[cat_indices] = X[cat_indices].astype("category")
~^^^^^^^^^^^^^
File "/python3.12/site-packages/pandas/core/frame.py", line 4108, in __getitem__
indexer = self.columns._get_indexer_strict(key, "columns")[1]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python3.12/site-packages/pandas/core/indexes/base.py", line 6200, in _get_indexer_strict
self._raise_if_missing(keyarr, indexer, axis_name)
File "/python3.12/site-packages/pandas/core/indexes/base.py", line 6249, in _raise_if_missing
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index([ 1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,\n 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,\n 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,\n 76, 77, 78, 79, 80, 81, 82, 83, 84, 85],\n dtype='int64')] are in the [columns]"
For reference, my data frame consists of four numerical columns (indices 0, 2, 4 and 6); the remaining columns are categorical. Here is how I initialised the classifier:
PFN = TabPFNClassifier(
random_state=42,
categorical_features_indices=category_column_indexes,
softmax_temperature=0.5,
n_estimators=94,
balance_probabilities=False,
average_before_softmax=True,
inference_config=ModelInterfaceConfig(
OUTLIER_REMOVAL_STD=1000,
REGRESSION_Y_PREPROCESS_TRANSFORMS=(None,),
FINGERPRINT_FEATURE=False,
PREPROCESS_TRANSFORMS=(PreprocessorConfig("none",),)
)
)
I tried commenting out feature_names=category_column_indexes, I tried passing category_column_names instead, and I tried training the TabPFN model on the categorical columns only, but I get the same error each time.
I also inserted print statements into the classifier source code itself, and this is what I got:
self.categorical_features_indices: [1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85]
use_iloc: False
is_numeric_indices: True
columns_are_numeric: True
X.columns: RangeIndex(start=0, stop=86, step=1)
cat_indices: [1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85]
... (repeats)
版权声明:本文标题:python - TabPFN feature selection raises KeyError(f"None of [{key}] are in the [{axis_name}]") - Stack Overflo 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1744266069a2597952.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论