ab-test-framework-ml
29
总安装量
3
周安装量
#12595
全站排名
安装命令
npx skills add https://github.com/dengineproblem/agents-monorepo --skill ab-test-framework-ml
Agent 安装分布
github-copilot
3
amp
2
claude-code
2
kimi-cli
2
gemini-cli
2
Skill 文档
A/B Test фреймворк для Machine Learning
Вы — эксперт по проектированию, реализации и анализу A/B-тестов специально для систем машинного обучения. Вы понимаете уникальные вызовы тестирования ML-моделей в продакшене, включая дрифт концепций, смещение моделей, расчёты статистической мощности и сложности измерения как бизнес-метрик, так и метрик производительности моделей.
Основные принципы ML A/B-тестирования
Статистическая строгость
- Всегда определяйте первичные и вторичные метрики перед запуском эксперимента
- Рассчитывайте минимально детектируемый эффект (MDE) и необходимые размеры выборки заранее
- Учитывайте поправки на множественное тестирование при оценке нескольких метрик
- Используйте правильные единицы рандомизации (уровень пользователя, сессии или запроса)
ML-специфичные соображения
- Мониторьте как метрики производительности модели (точность, AUC, precision/recall), так и бизнес-метрики (конверсия, выручка, вовлечённость)
- Учитывайте задержку инференса модели и вычислительные затраты в вашем анализе
- Рассматривайте временные эффекты и сезонность при анализе результатов
- Обеспечивайте версионирование моделей и воспроизводимость на протяжении всего эксперимента
Фреймворк дизайна экспериментов
Расчёт размера выборки
import numpy as np
from scipy import stats
from statsmodels.stats.power import ttest_power
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Return the required per-variant sample size for a two-proportion A/B test.

    Fix over the original: ``statsmodels.stats.power.ttest_power`` computes the
    *power* achieved for a given ``nobs`` — it does not solve for sample size,
    and it was being called without ``nobs`` at all. This version uses the
    standard normal-approximation formula for comparing two proportions:
    n = (z_{1-alpha/2} + z_{power})^2 * (p1(1-p1) + p2(1-p2)) / (p1 - p2)^2.

    Args:
        baseline_rate: Current conversion/success rate of the control variant.
        mde: Minimum detectable effect as a *relative* change (0.05 = +5%).
        alpha: Two-sided Type I error rate.
        power: Statistical power (1 - Type II error rate).

    Returns:
        int: Required number of observations per variant (ceiling).
    """
    p1 = baseline_rate
    p2 = baseline_rate * (1 + mde)  # treatment rate implied by the relative MDE
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)
    variance_sum = p1 * (1 - p1) + p2 * (1 - p2)
    n = ((z_alpha + z_beta) ** 2) * variance_sum / (p1 - p2) ** 2
    return int(np.ceil(n))
# Example: per-variant sample size needed to detect a 5% relative
# improvement over a 20% baseline conversion rate.
sample_size = calculate_sample_size(baseline_rate=0.20, mde=0.05)
print(f"Required sample size per variant: {sample_size}")
Рандомизация и разделение трафика
import hashlib
import random
class ABTestSplitter:
    """Deterministically routes users into A/B-test variants via hashing.

    A user's assignment depends only on (experiment_name, user_id), so it is
    stable across calls and across processes — no state is stored per user.
    """

    def __init__(self, experiment_name, traffic_allocation=0.1, control_ratio=0.5):
        self.experiment_name = experiment_name
        self.traffic_allocation = traffic_allocation  # fraction of all traffic in the experiment
        self.control_ratio = control_ratio            # within-experiment share sent to control

    def get_variant(self, user_id):
        """Return 'control', 'treatment', or 'not_in_experiment' for user_id."""
        # Consistent hashing: first 8 hex digits of MD5 give a 32-bit value,
        # normalized to a uniform bucket in [0, 1).
        digest = hashlib.md5(f"{self.experiment_name}_{user_id}".encode()).hexdigest()
        bucket = int(digest[:8], 16) / (2 ** 32)
        if bucket >= self.traffic_allocation:
            return "not_in_experiment"
        # Rescale the in-experiment portion back to [0, 1) before splitting.
        if bucket / self.traffic_allocation < self.control_ratio:
            return "control"
        return "treatment"
# Usage: route 20% of traffic into the experiment, split 50/50 between
# control and treatment; assignment is deterministic per user id.
splitter = ABTestSplitter("model_v2_test", traffic_allocation=0.2, control_ratio=0.5)
variant = splitter.get_variant("user_12345")
Деплой модели и мониторинг
Интеграция с Feature Store
class ABTestModelServer:
    """Serves predictions from either the control or the treatment model.

    Variant routing is delegated to an ABTestSplitter-style object; every
    prediction is logged with its variant, model version, latency, and
    timestamp.
    NOTE(review): relies on `time` and `MetricsLogger` being importable in
    this module — confirm the surrounding file provides them.
    """

    def __init__(self, control_model, treatment_model, splitter):
        self.control_model = control_model
        self.treatment_model = treatment_model
        self.splitter = splitter
        self.metrics_logger = MetricsLogger()

    def predict(self, user_id, features):
        """Return (prediction, variant) for user_id, logging the call."""
        variant = self.splitter.get_variant(user_id)
        started = time.time()
        # Users outside the experiment fall back to the control model.
        if variant == "treatment":
            model, model_version = self.treatment_model, "treatment"
        else:
            model, model_version = self.control_model, "control"
        prediction = model.predict(features)
        elapsed = time.time() - started
        self.metrics_logger.log_prediction({
            'user_id': user_id,
            'variant': variant,
            'model_version': model_version,
            'prediction': prediction,
            'latency_ms': elapsed * 1000,
            'timestamp': time.time()
        })
        return prediction, variant
Фреймворк статистического анализа
Байесовский анализ A/B-тестов
import pymc3 as pm
import arviz as az
def bayesian_ab_test(control_conversions, control_total,
                     treatment_conversions, treatment_total):
    """Bayesian conversion-rate comparison between control and treatment.

    Builds a Beta-Binomial model with uniform Beta(1, 1) priors on each
    variant's conversion rate, samples the posterior with PyMC3, and reports
    the posterior probability that the treatment improves on control.

    Returns:
        (trace, prob_positive): the sampled inference data and the posterior
        probability that the relative lift is positive.
    """
    with pm.Model() as model:
        # Uninformative Beta(1, 1) priors on each variant's conversion rate.
        rate_control = pm.Beta('alpha_control', alpha=1, beta=1)
        rate_treatment = pm.Beta('alpha_treatment', alpha=1, beta=1)
        # Observed conversions as Binomial likelihoods.
        pm.Binomial('control_obs', n=control_total, p=rate_control,
                    observed=control_conversions)
        pm.Binomial('treatment_obs', n=treatment_total, p=rate_treatment,
                    observed=treatment_conversions)
        # Relative lift of treatment over control, tracked in the trace.
        pm.Deterministic('lift', (rate_treatment - rate_control) / rate_control)
        trace = pm.sample(2000, tune=1000, return_inferencedata=True)
    # Posterior probability that the lift is positive.
    prob_positive = (trace.posterior.lift > 0).mean().item()
    return trace, prob_positive
Последовательное тестирование и досрочная остановка
class SequentialABTest:
    """Sequential A/B test with O'Brien-Fleming-style early stopping.

    Outcomes are accumulated one at a time; `should_stop` re-tests the data
    with an alpha level adjusted for repeated looks at the results.
    NOTE(review): `calculate_max_sample_size` and `obf_spending_function`
    are not defined in this snippet — they must be provided elsewhere.
    """

    def __init__(self, alpha=0.05, beta=0.2, mde=0.05):
        self.alpha = alpha  # overall Type I error budget
        self.beta = beta    # Type II error rate (power = 1 - beta)
        self.mde = mde      # minimum detectable effect
        self.data_points = []

    def add_observation(self, variant, outcome):
        """Record a single outcome for 'control' or 'treatment'."""
        self.data_points.append({'variant': variant, 'outcome': outcome})

    def should_stop(self):
        """Return (stop, reason): whether to end the test and why."""
        if len(self.data_points) < 100:  # Minimum sample size
            return False, "continue"
        control = [p['outcome'] for p in self.data_points
                   if p['variant'] == 'control']
        treatment = [p['outcome'] for p in self.data_points
                     if p['variant'] == 'treatment']
        # Require a minimum per-arm sample before testing at all.
        if len(control) < 50 or len(treatment) < 50:
            return False, "continue"
        _, p_value = stats.ttest_ind(treatment, control)
        # Spend alpha according to the fraction of information observed so far
        # (O'Brien-Fleming spending function).
        total = len(self.data_points)
        max_n = self.calculate_max_sample_size()
        adjusted_alpha = self.obf_spending_function(total / max_n)
        if p_value < adjusted_alpha:
            return True, "significant"
        if total >= max_n:
            return True, "max_sample_reached"
        return False, "continue"
Мониторинг производительности модели
Детекция дрифта
from scipy.stats import ks_2samp
from scipy.spatial.distance import jensenshannon
class ModelDriftMonitor:
    """Detects drift between a baseline prediction distribution and new predictions.

    Combines a two-sample Kolmogorov-Smirnov test (sensitive to any
    distributional shift) with the Jensen-Shannon divergence computed over a
    shared histogram of both samples.
    """

    def __init__(self, baseline_predictions, threshold=0.05):
        """
        Args:
            baseline_predictions: 1-D sequence of reference-period predictions.
            threshold: KS p-value below which drift is flagged.
        """
        self.baseline_predictions = np.asarray(baseline_predictions, dtype=float)
        self.threshold = threshold

    def detect_prediction_drift(self, current_predictions):
        """Return a dict with the drift verdict and underlying statistics.

        Fix over the original: `jensenshannon` expects two equal-length
        probability vectors, not raw sample arrays (which may even differ in
        size). Both samples are therefore binned over a common range first,
        and the resulting count vectors are compared.
        """
        current = np.asarray(current_predictions, dtype=float)
        ks_stat, ks_pvalue = ks_2samp(self.baseline_predictions, current)
        # Shared bin edges so both histograms are comparable distributions.
        lo = min(self.baseline_predictions.min(), current.min())
        hi = max(self.baseline_predictions.max(), current.max())
        if hi == lo:  # degenerate case: every prediction is identical
            hi = lo + 1.0
        edges = np.linspace(lo, hi, 21)
        baseline_hist, _ = np.histogram(self.baseline_predictions, bins=edges)
        current_hist, _ = np.histogram(current, bins=edges)
        # jensenshannon normalizes the count vectors to distributions itself.
        js_divergence = float(jensenshannon(baseline_hist, current_hist))
        drift_detected = ks_pvalue < self.threshold or js_divergence > 0.1
        return {
            'drift_detected': drift_detected,
            'ks_statistic': ks_stat,
            'ks_pvalue': ks_pvalue,
            'js_divergence': js_divergence
        }
Лучшие практики и рекомендации
Конфигурация экспериментов
- Используйте конфигурационные файлы для управления параметрами экспериментов
- Реализуйте корректное логирование всех предсказаний модели
- Настройте автоматические уведомления при снижении производительности
- Поддерживайте отдельные окружения для разработки и продакшена
Анализ и отчётность
- Всегда сообщайте доверительные интервалы, а не только точечные оценки
- Учитывайте как практическую, так и статистическую значимость
- Выполняйте проверки устойчивости с различными подходами
- Документируйте нарушения предположений и их влияние
Типичные ошибки
- Не заглядывайте в результаты без поправки на множественное тестирование
- Избегайте изменения параметров эксперимента в процессе его проведения
- Не игнорируйте различия в задержке моделей
- Убедитесь, что единица рандомизации соответствует единице анализа