categorical-encoder
25
总安装量
3
周安装量
#14769
全站排名
安装命令
npx skills add https://github.com/dengineproblem/agents-monorepo --skill categorical-encoder
Agent 安装分布
github-copilot
3
amp
2
claude-code
2
kimi-cli
2
gemini-cli
2
Skill 文档
Categorical Encoder Expert
Эксперт по кодированию категориальных переменных для машинного обучения.
Выбор на основе кардинальности
| Кардинальность | Рекомендация |
|---|---|
| Низкая (<10) | One-hot, Dummy |
| Средняя (10-50) | Target, Frequency, Binary |
| Высокая (>50) | Hash, Embeddings |
| Порядковая | Ordinal |
One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
# With pandas: expand 'category_col' into indicator columns prefixed with 'cat'
df_encoded = pd.get_dummies(df, columns=['category_col'], prefix='cat')
# With scikit-learn: dense output; handle_unknown='ignore' makes categories
# unseen during fit encode as all zeros instead of raising at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Learn the categories from the training data only, then reuse them for test
X_train_encoded = encoder.fit_transform(X_train[['category_col']])
X_test_encoded = encoder.transform(X_test[['category_col']])
# Get the names of the generated feature columns
feature_names = encoder.get_feature_names_out(['category_col'])
Target Encoding с кросс-валидацией
from sklearn.model_selection import KFold
import numpy as np
def target_encode_cv(X, y, column, n_splits=5, alpha=1.0):
    """Out-of-fold target encoding of ``X[column]`` with additive smoothing.

    Every row is encoded using statistics computed from the *other* folds
    only, so a row's own target never contributes to its encoded value.
    Both the per-category means and the smoothing prior are taken from the
    training part of each fold.

    Args:
        X: DataFrame that contains ``column``.
        y: Target values, aligned positionally with the rows of ``X``
            (Series, list or array).
        column: Name of the categorical column to encode.
        n_splits: Number of KFold splits.
        alpha: Smoothing strength; larger values pull each category mean
            towards the fold prior.

    Returns:
        1-D float ``numpy.ndarray`` of length ``len(X)``. Categories that do
        not occur in the training part of a fold (and missing values) receive
        that fold's prior.
    """
    import pandas as pd  # local import keeps this block self-contained

    # Work positionally so differing indexes on X and y cannot misalign rows.
    categories = X[column].reset_index(drop=True)
    if categories.dtype.name == "category":
        # Mapping a categorical Series may return a categorical result on
        # which fillna() with a new value fails; plain objects avoid that.
        categories = categories.astype(object)
    targets = pd.Series(np.asarray(y, dtype=float))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = np.zeros(len(X))

    for train_idx, val_idx in kf.split(X):
        fold_categories = categories.iloc[train_idx]
        fold_targets = targets.iloc[train_idx]

        # The prior comes from the training part only; using the mean of all
        # of y would leak validation-fold targets into the encoding.
        prior = fold_targets.mean()

        # sum == count * mean, so this is the usual smoothed category mean.
        stats = fold_targets.groupby(fold_categories).agg(["sum", "count"])
        smoothed_means = (stats["sum"] + alpha * prior) / (stats["count"] + alpha)

        encoded[val_idx] = (
            categories.iloc[val_idx].map(smoothed_means).fillna(prior).to_numpy()
        )

    return encoded
Binary Encoding
import category_encoders as ce
# Binary encoding reduces dimensionality compared with one-hot
binary_encoder = ce.BinaryEncoder(cols=['high_cardinality_col'])
X_train_binary = binary_encoder.fit_transform(X_train)
X_test_binary = binary_encoder.transform(X_test)
# For 100 categories: one-hot = 100 features, binary = 7 features
# Report on the same column that was handed to the encoder above.
print(f"Исходных категорий: {X_train['high_cardinality_col'].nunique()}")
print(f"Binary признаков: {len([c for c in X_train_binary.columns if 'high_cardinality_col' in c])}")
Frequency и Count Encoding
def frequency_encode(train_series, test_series=None):
    """Encode categories by their relative frequency in the training data.

    Args:
        train_series: Series of categories used to learn the frequencies.
        test_series: Optional Series encoded with the frequencies learned
            from ``train_series``.

    Returns:
        The encoded training Series, or a ``(train_encoded, test_encoded)``
        tuple when ``test_series`` is given. Values without a learned
        frequency (unseen categories and missing values) are encoded as 0 in
        both outputs, so train and test treat missing data the same way.
    """
    freq_map = train_series.value_counts(normalize=True).to_dict()

    def _encode(series):
        # Mapping a category-dtype Series may return a categorical result,
        # on which fillna(0) fails because 0 is not a category; map plain
        # objects instead so the output is always numeric.
        if series.dtype.name == "category":
            series = series.astype(object)
        return series.map(freq_map).fillna(0)

    train_encoded = _encode(train_series)
    if test_series is None:
        return train_encoded
    return train_encoded, _encode(test_series)
def count_encode(train_series, test_series=None):
    """Encode categories by their absolute count in the training data.

    Args:
        train_series: Series of categories used to learn the counts.
        test_series: Optional Series encoded with the counts learned from
            ``train_series``.

    Returns:
        The encoded training Series, or a ``(train_encoded, test_encoded)``
        tuple when ``test_series`` is given. Values without a learned count
        (unseen categories and missing values) are encoded as 0 in both
        outputs, so train and test treat missing data the same way.
    """
    count_map = train_series.value_counts().to_dict()

    def _encode(series):
        # Mapping a category-dtype Series may return a categorical result,
        # on which fillna(0) fails because 0 is not a category; map plain
        # objects instead so the output is always numeric.
        if series.dtype.name == "category":
            series = series.astype(object)
        return series.map(count_map).fillna(0)

    train_encoded = _encode(train_series)
    if test_series is None:
        return train_encoded
    return train_encoded, _encode(test_series)
Embeddings для высокой кардинальности
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
def create_categorical_embeddings(X_train, X_test, column, n_components=10):
    """Build dense embeddings for one column via one-hot encoding plus SVD.

    The one-hot encoder and the truncated SVD are both fitted on the
    training data; the test data is only transformed.

    Returns:
        ``(X_train_emb, X_test_emb, encoder, svd)`` - the two embedded
        matrices followed by the fitted encoder and SVD objects.
    """
    one_hot = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    reducer = TruncatedSVD(n_components=n_components, random_state=42)

    # Training side: fit each stage and keep the reduced representation.
    train_sparse = one_hot.fit_transform(X_train[[column]])
    train_embedded = reducer.fit_transform(train_sparse)

    # Test side: reuse the fitted stages without refitting.
    test_embedded = reducer.transform(one_hot.transform(X_test[[column]]))

    return train_embedded, test_embedded, one_hot, reducer
Multiple Encoding Strategy
def multi_encode_categorical(df, column, target=None):
    """Build several alternative encodings of one categorical column.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the categorical column to encode.
        target: Optional target values; when given, a cross-validated target
            encoding is added as well.

    Returns:
        DataFrame with the columns ``<column>_freq``, ``<column>_count``,
        ``<column>_target`` (only when ``target`` is given) and
        ``<column>_ordinal``.
    """
    from sklearn.preprocessing import LabelEncoder

    encodings = {
        # Frequency
        f'{column}_freq': frequency_encode(df[column]),
        # Count
        f'{column}_count': count_encode(df[column]),
    }

    # Target (only when a target is available)
    if target is not None:
        encodings[f'{column}_target'] = target_encode_cv(df, target, column)

    # Ordinal codes, intended for tree-based models
    encodings[f'{column}_ordinal'] = LabelEncoder().fit_transform(df[column])

    return pd.DataFrame(encodings)
Production-ready Encoder
class RobustCategoricalEncoder:
    """Fit-once / transform-many encoder for categorical DataFrame columns.

    Columns of dtype ``object`` or ``category`` are encoded according to
    ``encoding_type``:

    * ``'onehot'`` - each column is replaced by one-hot indicator columns.
    * ``'target'`` - each value is replaced by the mean target of its
      category; values without a learned mean get the overall target mean.

    Any other ``encoding_type`` leaves the data unchanged.
    """

    def __init__(self, encoding_type='onehot', handle_unknown='mode'):
        self.encoding_type = encoding_type
        # Stored for API compatibility; fit() and transform() do not read it.
        self.handle_unknown = handle_unknown
        # column name -> fitted OneHotEncoder ('onehot') or dict ('target')
        self.encoders = {}
        # column name -> value used for categories without a learned mean
        self.fallback_values = {}

    @staticmethod
    def _as_plain(series):
        """Return ``series`` with category dtype converted to plain objects."""
        # Mapping a category-dtype Series may yield a categorical result on
        # which fillna() with a new value fails; plain objects avoid that.
        if series.dtype.name == "category":
            return series.astype(object)
        return series

    def fit(self, X, y=None):
        """Learn the encoders from ``X`` (and ``y`` for target encoding).

        For ``'target'`` encoding nothing is learned when ``y`` is None.

        Returns:
            ``self``, to allow call chaining.
        """
        # Start from a clean state so that refitting on data with different
        # columns does not keep encoders from an earlier fit.
        self.encoders = {}
        self.fallback_values = {}

        for column in X.select_dtypes(include=['object', 'category']).columns:
            if self.encoding_type == 'onehot':
                encoder = OneHotEncoder(
                    sparse_output=False,
                    handle_unknown='ignore'
                )
                encoder.fit(X[[column]])
                self.encoders[column] = encoder
            elif self.encoding_type == 'target' and y is not None:
                keys = self._as_plain(X[column])
                self.encoders[column] = y.groupby(keys).mean().to_dict()
                self.fallback_values[column] = y.mean()
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; ``X`` itself is not modified.

        One-hot columns are appended after the remaining original columns,
        in the order in which the encoders were fitted.
        """
        X_transformed = X.copy()

        if self.encoding_type == 'onehot':
            encoded_blocks = [
                pd.DataFrame(
                    encoder.transform(X[[column]]),
                    columns=encoder.get_feature_names_out([column]),
                    index=X.index
                )
                for column, encoder in self.encoders.items()
            ]
            if encoded_blocks:
                # One concat for all columns instead of one per column.
                remaining = X_transformed.drop(columns=list(self.encoders))
                X_transformed = pd.concat([remaining, *encoded_blocks], axis=1)
        elif self.encoding_type == 'target':
            for column, target_map in self.encoders.items():
                values = self._as_plain(X_transformed[column])
                X_transformed[column] = values.map(target_map).fillna(
                    self.fallback_values[column]
                )
        return X_transformed
РекомендаÑии Ð´Ð»Ñ Ð¼Ð¾Ð´ÐµÐ»ÐµÐ¹
| ÐÐ¾Ð´ÐµÐ»Ñ | РекомендÑемое кодиÑование |
|---|---|
| ÐÑевеÑнÑе (RF, XGB) | Ordinal, Target, Frequency |
| ÐинейнÑе (LR, SVM) | One-hot, избегаÑÑ ordinal |
| ÐейÑоннÑе ÑеÑи | Embeddings Ð´Ð»Ñ Ð²ÑÑокой каÑдиналÑноÑÑи |
| Ðа оÑнове ÑаÑÑÑоÑÐ½Ð¸Ñ | СÑандаÑÑизиÑованнÑе закодиÑованнÑе |
Предотвращение утечки данных
# CORRECT: fit on the training data only
encoder.fit(X_train)
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)
# WRONG: fit on all of the data
encoder.fit(X_all)  # Leakage!
Валидация
def validate_encoding(X_original, X_encoded):
    """Print a short diagnostic report comparing data before and after encoding.

    The report contains both shapes, the memory footprint of the encoded
    frame in MB, a warning when the encoded frame contains missing values,
    and the column expansion factor.

    Args:
        X_original: DataFrame before encoding.
        X_encoded: DataFrame after encoding.

    Returns:
        None; the report is written to standard output.

    Raises:
        ZeroDivisionError: If ``X_original`` has no columns.
    """
    print(f"Исходная размерность: {X_original.shape}")
    print(f"Закодированная размерность: {X_encoded.shape}")

    memory_mb = X_encoded.memory_usage(deep=True).sum() / 1024**2
    print(f"Память: {memory_mb:.2f} MB")

    # Check for NaN values
    null_count = X_encoded.isnull().sum().sum()
    if null_count > 0:
        print(f"Предупреждение: {null_count} пустых значений")

    # Expansion factor: encoded column count relative to the original
    print(f"Расширение: {X_encoded.shape[1] / X_original.shape[1]:.2f}x")
Лучшие практики
- Fit только на train — избегайте утечки данных
- Обрабатывайте unknown — используйте fallback стратегию
- Используйте CV для target encoding — предотвращает переобучение
- Мониторьте размерность — one-hot взрывает размерность
- Выбирайте по модели — разные модели предпочитают разное