"""A/B testing module for App Store Optimization.

Plans and tracks A/B tests for metadata and visual assets.
"""
import math
import time
import uuid

from typing import Dict, List, Any, Optional
class ABTestPlanner:
    """Plans and tracks A/B tests for ASO elements."""

    # Minimum detectable effect sizes (conservative estimates)
    MIN_EFFECT_SIZES = {
        'icon': 0.10,         # 10% conversion improvement
        'screenshot': 0.08,   # 8% conversion improvement
        'title': 0.05,        # 5% conversion improvement
        'description': 0.03,  # 3% conversion improvement
    }

    # Statistical confidence levels
    CONFIDENCE_LEVELS = {
        'high': 0.95,         # 95% confidence
        'standard': 0.90,     # 90% confidence
        'exploratory': 0.80,  # 80% confidence
    }

    def __init__(self):
        """Initialize A/B test planner with no active tests."""
        # Tests created via design_test are appended here and looked up
        # later by test_id in track_test_results / generate_test_report.
        self.active_tests: List[Dict[str, Any]] = []
def design_test(
|
|
self,
|
|
test_type: str,
|
|
variant_a: Dict[str, Any],
|
|
variant_b: Dict[str, Any],
|
|
hypothesis: str,
|
|
success_metric: str = 'conversion_rate'
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Design an A/B test with hypothesis and variables.
|
|
|
|
Args:
|
|
test_type: Type of test ('icon', 'screenshot', 'title', 'description')
|
|
variant_a: Control variant details
|
|
variant_b: Test variant details
|
|
hypothesis: Expected outcome hypothesis
|
|
success_metric: Metric to optimize
|
|
|
|
Returns:
|
|
Test design with configuration
|
|
"""
|
|
test_design = {
|
|
'test_id': self._generate_test_id(test_type),
|
|
'test_type': test_type,
|
|
'hypothesis': hypothesis,
|
|
'variants': {
|
|
'a': {
|
|
'name': 'Control',
|
|
'details': variant_a,
|
|
'traffic_split': 0.5
|
|
},
|
|
'b': {
|
|
'name': 'Variation',
|
|
'details': variant_b,
|
|
'traffic_split': 0.5
|
|
}
|
|
},
|
|
'success_metric': success_metric,
|
|
'secondary_metrics': self._get_secondary_metrics(test_type),
|
|
'minimum_effect_size': self.MIN_EFFECT_SIZES.get(test_type, 0.05),
|
|
'recommended_confidence': 'standard',
|
|
'best_practices': self._get_test_best_practices(test_type)
|
|
}
|
|
|
|
self.active_tests.append(test_design)
|
|
return test_design
|
|
|
|
def calculate_sample_size(
|
|
self,
|
|
baseline_conversion: float,
|
|
minimum_detectable_effect: float,
|
|
confidence_level: str = 'standard',
|
|
power: float = 0.80
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Calculate required sample size for statistical significance.
|
|
|
|
Args:
|
|
baseline_conversion: Current conversion rate (0-1)
|
|
minimum_detectable_effect: Minimum effect size to detect (0-1)
|
|
confidence_level: 'high', 'standard', or 'exploratory'
|
|
power: Statistical power (typically 0.80 or 0.90)
|
|
|
|
Returns:
|
|
Sample size calculation with duration estimates
|
|
"""
|
|
alpha = 1 - self.CONFIDENCE_LEVELS[confidence_level]
|
|
beta = 1 - power
|
|
|
|
# Expected conversion for variant B
|
|
expected_conversion_b = baseline_conversion * (1 + minimum_detectable_effect)
|
|
|
|
# Z-scores for alpha and beta
|
|
z_alpha = self._get_z_score(1 - alpha / 2) # Two-tailed test
|
|
z_beta = self._get_z_score(power)
|
|
|
|
# Pooled standard deviation
|
|
p_pooled = (baseline_conversion + expected_conversion_b) / 2
|
|
sd_pooled = math.sqrt(2 * p_pooled * (1 - p_pooled))
|
|
|
|
# Sample size per variant
|
|
n_per_variant = math.ceil(
|
|
((z_alpha + z_beta) ** 2 * sd_pooled ** 2) /
|
|
((expected_conversion_b - baseline_conversion) ** 2)
|
|
)
|
|
|
|
total_sample_size = n_per_variant * 2
|
|
|
|
# Estimate duration based on typical traffic
|
|
duration_estimates = self._estimate_test_duration(
|
|
total_sample_size,
|
|
baseline_conversion
|
|
)
|
|
|
|
return {
|
|
'sample_size_per_variant': n_per_variant,
|
|
'total_sample_size': total_sample_size,
|
|
'baseline_conversion': baseline_conversion,
|
|
'expected_conversion_improvement': minimum_detectable_effect,
|
|
'expected_conversion_b': expected_conversion_b,
|
|
'confidence_level': confidence_level,
|
|
'statistical_power': power,
|
|
'duration_estimates': duration_estimates,
|
|
'recommendations': self._generate_sample_size_recommendations(
|
|
n_per_variant,
|
|
duration_estimates
|
|
)
|
|
}
|
|
|
|
def calculate_significance(
|
|
self,
|
|
variant_a_conversions: int,
|
|
variant_a_visitors: int,
|
|
variant_b_conversions: int,
|
|
variant_b_visitors: int
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Calculate statistical significance of test results.
|
|
|
|
Args:
|
|
variant_a_conversions: Conversions for control
|
|
variant_a_visitors: Visitors for control
|
|
variant_b_conversions: Conversions for variation
|
|
variant_b_visitors: Visitors for variation
|
|
|
|
Returns:
|
|
Significance analysis with decision recommendation
|
|
"""
|
|
# Calculate conversion rates
|
|
rate_a = variant_a_conversions / variant_a_visitors if variant_a_visitors > 0 else 0
|
|
rate_b = variant_b_conversions / variant_b_visitors if variant_b_visitors > 0 else 0
|
|
|
|
# Calculate improvement
|
|
if rate_a > 0:
|
|
relative_improvement = (rate_b - rate_a) / rate_a
|
|
else:
|
|
relative_improvement = 0
|
|
|
|
absolute_improvement = rate_b - rate_a
|
|
|
|
# Calculate standard error
|
|
se_a = math.sqrt(rate_a * (1 - rate_a) / variant_a_visitors) if variant_a_visitors > 0 else 0
|
|
se_b = math.sqrt(rate_b * (1 - rate_b) / variant_b_visitors) if variant_b_visitors > 0 else 0
|
|
se_diff = math.sqrt(se_a**2 + se_b**2)
|
|
|
|
# Calculate z-score
|
|
z_score = absolute_improvement / se_diff if se_diff > 0 else 0
|
|
|
|
# Calculate p-value (two-tailed)
|
|
p_value = 2 * (1 - self._standard_normal_cdf(abs(z_score)))
|
|
|
|
# Determine significance
|
|
is_significant_95 = p_value < 0.05
|
|
is_significant_90 = p_value < 0.10
|
|
|
|
# Generate decision
|
|
decision = self._generate_test_decision(
|
|
relative_improvement,
|
|
is_significant_95,
|
|
is_significant_90,
|
|
variant_a_visitors + variant_b_visitors
|
|
)
|
|
|
|
return {
|
|
'variant_a': {
|
|
'conversions': variant_a_conversions,
|
|
'visitors': variant_a_visitors,
|
|
'conversion_rate': round(rate_a, 4)
|
|
},
|
|
'variant_b': {
|
|
'conversions': variant_b_conversions,
|
|
'visitors': variant_b_visitors,
|
|
'conversion_rate': round(rate_b, 4)
|
|
},
|
|
'improvement': {
|
|
'absolute': round(absolute_improvement, 4),
|
|
'relative_percentage': round(relative_improvement * 100, 2)
|
|
},
|
|
'statistical_analysis': {
|
|
'z_score': round(z_score, 3),
|
|
'p_value': round(p_value, 4),
|
|
'is_significant_95': is_significant_95,
|
|
'is_significant_90': is_significant_90,
|
|
'confidence_level': '95%' if is_significant_95 else ('90%' if is_significant_90 else 'Not significant')
|
|
},
|
|
'decision': decision
|
|
}
|
|
|
|
def track_test_results(
|
|
self,
|
|
test_id: str,
|
|
results_data: Dict[str, Any]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Track ongoing test results and provide recommendations.
|
|
|
|
Args:
|
|
test_id: Test identifier
|
|
results_data: Current test results
|
|
|
|
Returns:
|
|
Test tracking report with next steps
|
|
"""
|
|
# Find test
|
|
test = next((t for t in self.active_tests if t['test_id'] == test_id), None)
|
|
if not test:
|
|
return {'error': f'Test {test_id} not found'}
|
|
|
|
# Calculate significance
|
|
significance = self.calculate_significance(
|
|
results_data['variant_a_conversions'],
|
|
results_data['variant_a_visitors'],
|
|
results_data['variant_b_conversions'],
|
|
results_data['variant_b_visitors']
|
|
)
|
|
|
|
# Calculate test progress
|
|
total_visitors = results_data['variant_a_visitors'] + results_data['variant_b_visitors']
|
|
required_sample = results_data.get('required_sample_size', 10000)
|
|
progress_percentage = min((total_visitors / required_sample) * 100, 100)
|
|
|
|
# Generate recommendations
|
|
recommendations = self._generate_tracking_recommendations(
|
|
significance,
|
|
progress_percentage,
|
|
test['test_type']
|
|
)
|
|
|
|
return {
|
|
'test_id': test_id,
|
|
'test_type': test['test_type'],
|
|
'progress': {
|
|
'total_visitors': total_visitors,
|
|
'required_sample_size': required_sample,
|
|
'progress_percentage': round(progress_percentage, 1),
|
|
'is_complete': progress_percentage >= 100
|
|
},
|
|
'current_results': significance,
|
|
'recommendations': recommendations,
|
|
'next_steps': self._determine_next_steps(
|
|
significance,
|
|
progress_percentage
|
|
)
|
|
}
|
|
|
|
def generate_test_report(
|
|
self,
|
|
test_id: str,
|
|
final_results: Dict[str, Any]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Generate final test report with insights and recommendations.
|
|
|
|
Args:
|
|
test_id: Test identifier
|
|
final_results: Final test results
|
|
|
|
Returns:
|
|
Comprehensive test report
|
|
"""
|
|
test = next((t for t in self.active_tests if t['test_id'] == test_id), None)
|
|
if not test:
|
|
return {'error': f'Test {test_id} not found'}
|
|
|
|
significance = self.calculate_significance(
|
|
final_results['variant_a_conversions'],
|
|
final_results['variant_a_visitors'],
|
|
final_results['variant_b_conversions'],
|
|
final_results['variant_b_visitors']
|
|
)
|
|
|
|
# Generate insights
|
|
insights = self._generate_test_insights(
|
|
test,
|
|
significance,
|
|
final_results
|
|
)
|
|
|
|
# Implementation plan
|
|
implementation_plan = self._create_implementation_plan(
|
|
test,
|
|
significance
|
|
)
|
|
|
|
return {
|
|
'test_summary': {
|
|
'test_id': test_id,
|
|
'test_type': test['test_type'],
|
|
'hypothesis': test['hypothesis'],
|
|
'duration_days': final_results.get('duration_days', 'N/A')
|
|
},
|
|
'results': significance,
|
|
'insights': insights,
|
|
'implementation_plan': implementation_plan,
|
|
'learnings': self._extract_learnings(test, significance)
|
|
}
|
|
|
|
def _generate_test_id(self, test_type: str) -> str:
|
|
"""Generate unique test ID."""
|
|
import time
|
|
timestamp = int(time.time())
|
|
return f"{test_type}_{timestamp}"
|
|
|
|
def _get_secondary_metrics(self, test_type: str) -> List[str]:
|
|
"""Get secondary metrics to track for test type."""
|
|
metrics_map = {
|
|
'icon': ['tap_through_rate', 'impression_count', 'brand_recall'],
|
|
'screenshot': ['tap_through_rate', 'time_on_page', 'scroll_depth'],
|
|
'title': ['impression_count', 'tap_through_rate', 'search_visibility'],
|
|
'description': ['time_on_page', 'scroll_depth', 'tap_through_rate']
|
|
}
|
|
return metrics_map.get(test_type, ['tap_through_rate'])
|
|
|
|
def _get_test_best_practices(self, test_type: str) -> List[str]:
|
|
"""Get best practices for specific test type."""
|
|
practices_map = {
|
|
'icon': [
|
|
'Test only one element at a time (color vs. style vs. symbolism)',
|
|
'Ensure icon is recognizable at small sizes (60x60px)',
|
|
'Consider cultural context for global audience',
|
|
'Test against top competitor icons'
|
|
],
|
|
'screenshot': [
|
|
'Test order of screenshots (users see first 2-3)',
|
|
'Use captions to tell story',
|
|
'Show key features and benefits',
|
|
'Test with and without device frames'
|
|
],
|
|
'title': [
|
|
'Test keyword variations, not major rebrand',
|
|
'Keep brand name consistent',
|
|
'Ensure title fits within character limits',
|
|
'Test on both search and browse contexts'
|
|
],
|
|
'description': [
|
|
'Test structure (bullet points vs. paragraphs)',
|
|
'Test call-to-action placement',
|
|
'Test feature vs. benefit focus',
|
|
'Maintain keyword density'
|
|
]
|
|
}
|
|
return practices_map.get(test_type, ['Test one variable at a time'])
|
|
|
|
def _estimate_test_duration(
|
|
self,
|
|
required_sample_size: int,
|
|
baseline_conversion: float
|
|
) -> Dict[str, Any]:
|
|
"""Estimate test duration based on typical traffic levels."""
|
|
# Assume different daily traffic scenarios
|
|
traffic_scenarios = {
|
|
'low': 100, # 100 page views/day
|
|
'medium': 1000, # 1000 page views/day
|
|
'high': 10000 # 10000 page views/day
|
|
}
|
|
|
|
estimates = {}
|
|
for scenario, daily_views in traffic_scenarios.items():
|
|
days = math.ceil(required_sample_size / daily_views)
|
|
estimates[scenario] = {
|
|
'daily_page_views': daily_views,
|
|
'estimated_days': days,
|
|
'estimated_weeks': round(days / 7, 1)
|
|
}
|
|
|
|
return estimates
|
|
|
|
def _generate_sample_size_recommendations(
|
|
self,
|
|
sample_size: int,
|
|
duration_estimates: Dict[str, Any]
|
|
) -> List[str]:
|
|
"""Generate recommendations based on sample size."""
|
|
recommendations = []
|
|
|
|
if sample_size > 50000:
|
|
recommendations.append(
|
|
"Large sample size required - consider testing smaller effect size or increasing traffic"
|
|
)
|
|
|
|
if duration_estimates['medium']['estimated_days'] > 30:
|
|
recommendations.append(
|
|
"Long test duration - consider higher minimum detectable effect or focus on high-impact changes"
|
|
)
|
|
|
|
if duration_estimates['low']['estimated_days'] > 60:
|
|
recommendations.append(
|
|
"Insufficient traffic for reliable testing - consider user acquisition or broader targeting"
|
|
)
|
|
|
|
if not recommendations:
|
|
recommendations.append("Sample size and duration are reasonable for this test")
|
|
|
|
return recommendations
|
|
|
|
def _get_z_score(self, percentile: float) -> float:
|
|
"""Get z-score for given percentile (approximation)."""
|
|
# Common z-scores
|
|
z_scores = {
|
|
0.80: 0.84,
|
|
0.85: 1.04,
|
|
0.90: 1.28,
|
|
0.95: 1.645,
|
|
0.975: 1.96,
|
|
0.99: 2.33
|
|
}
|
|
return z_scores.get(percentile, 1.96)
|
|
|
|
def _standard_normal_cdf(self, z: float) -> float:
|
|
"""Approximate standard normal cumulative distribution function."""
|
|
# Using error function approximation
|
|
t = 1.0 / (1.0 + 0.2316419 * abs(z))
|
|
d = 0.3989423 * math.exp(-z * z / 2.0)
|
|
p = d * t * (0.3193815 + t * (-0.3565638 + t * (1.781478 + t * (-1.821256 + t * 1.330274))))
|
|
|
|
if z > 0:
|
|
return 1.0 - p
|
|
else:
|
|
return p
|
|
|
|
def _generate_test_decision(
|
|
self,
|
|
improvement: float,
|
|
is_significant_95: bool,
|
|
is_significant_90: bool,
|
|
total_visitors: int
|
|
) -> Dict[str, Any]:
|
|
"""Generate test decision and recommendation."""
|
|
if total_visitors < 1000:
|
|
return {
|
|
'decision': 'continue',
|
|
'rationale': 'Insufficient data - continue test to reach minimum sample size',
|
|
'action': 'Keep test running'
|
|
}
|
|
|
|
if is_significant_95:
|
|
if improvement > 0:
|
|
return {
|
|
'decision': 'implement_b',
|
|
'rationale': f'Variant B shows {improvement*100:.1f}% improvement with 95% confidence',
|
|
'action': 'Implement Variant B'
|
|
}
|
|
else:
|
|
return {
|
|
'decision': 'keep_a',
|
|
'rationale': 'Variant A performs better with 95% confidence',
|
|
'action': 'Keep current version (A)'
|
|
}
|
|
|
|
elif is_significant_90:
|
|
if improvement > 0:
|
|
return {
|
|
'decision': 'implement_b_cautiously',
|
|
'rationale': f'Variant B shows {improvement*100:.1f}% improvement with 90% confidence',
|
|
'action': 'Consider implementing B, monitor closely'
|
|
}
|
|
else:
|
|
return {
|
|
'decision': 'keep_a',
|
|
'rationale': 'Variant A performs better with 90% confidence',
|
|
'action': 'Keep current version (A)'
|
|
}
|
|
|
|
else:
|
|
return {
|
|
'decision': 'inconclusive',
|
|
'rationale': 'No statistically significant difference detected',
|
|
'action': 'Either keep A or test different hypothesis'
|
|
}
|
|
|
|
def _generate_tracking_recommendations(
|
|
self,
|
|
significance: Dict[str, Any],
|
|
progress: float,
|
|
test_type: str
|
|
) -> List[str]:
|
|
"""Generate recommendations for ongoing test."""
|
|
recommendations = []
|
|
|
|
if progress < 50:
|
|
recommendations.append(
|
|
f"Test is {progress:.0f}% complete - continue collecting data"
|
|
)
|
|
|
|
if progress >= 100:
|
|
if significance['statistical_analysis']['is_significant_95']:
|
|
recommendations.append(
|
|
"Sufficient data collected with significant results - ready to conclude test"
|
|
)
|
|
else:
|
|
recommendations.append(
|
|
"Sample size reached but no significant difference - consider extending test or concluding"
|
|
)
|
|
|
|
return recommendations
|
|
|
|
def _determine_next_steps(
|
|
self,
|
|
significance: Dict[str, Any],
|
|
progress: float
|
|
) -> str:
|
|
"""Determine next steps for test."""
|
|
if progress < 100:
|
|
return f"Continue test until reaching 100% sample size (currently {progress:.0f}%)"
|
|
|
|
decision = significance.get('decision', {}).get('decision', 'inconclusive')
|
|
|
|
if decision == 'implement_b':
|
|
return "Implement Variant B and monitor metrics for 2 weeks"
|
|
elif decision == 'keep_a':
|
|
return "Keep Variant A and design new test with different hypothesis"
|
|
else:
|
|
return "Test inconclusive - either keep A or design new test"
|
|
|
|
def _generate_test_insights(
|
|
self,
|
|
test: Dict[str, Any],
|
|
significance: Dict[str, Any],
|
|
results: Dict[str, Any]
|
|
) -> List[str]:
|
|
"""Generate insights from test results."""
|
|
insights = []
|
|
|
|
improvement = significance['improvement']['relative_percentage']
|
|
|
|
if significance['statistical_analysis']['is_significant_95']:
|
|
insights.append(
|
|
f"Strong evidence: Variant B {'improved' if improvement > 0 else 'decreased'} "
|
|
f"conversion by {abs(improvement):.1f}% with 95% confidence"
|
|
)
|
|
|
|
insights.append(
|
|
f"Tested {test['test_type']} changes: {test['hypothesis']}"
|
|
)
|
|
|
|
# Add context-specific insights
|
|
if test['test_type'] == 'icon' and improvement > 5:
|
|
insights.append(
|
|
"Icon change had substantial impact - visual first impression is critical"
|
|
)
|
|
|
|
return insights
|
|
|
|
def _create_implementation_plan(
|
|
self,
|
|
test: Dict[str, Any],
|
|
significance: Dict[str, Any]
|
|
) -> List[Dict[str, str]]:
|
|
"""Create implementation plan for winning variant."""
|
|
plan = []
|
|
|
|
if significance.get('decision', {}).get('decision') == 'implement_b':
|
|
plan.append({
|
|
'step': '1. Update store listing',
|
|
'details': f"Replace {test['test_type']} with Variant B across all platforms"
|
|
})
|
|
plan.append({
|
|
'step': '2. Monitor metrics',
|
|
'details': 'Track conversion rate for 2 weeks to confirm sustained improvement'
|
|
})
|
|
plan.append({
|
|
'step': '3. Document learnings',
|
|
'details': 'Record insights for future optimization'
|
|
})
|
|
|
|
return plan
|
|
|
|
def _extract_learnings(
|
|
self,
|
|
test: Dict[str, Any],
|
|
significance: Dict[str, Any]
|
|
) -> List[str]:
|
|
"""Extract key learnings from test."""
|
|
learnings = []
|
|
|
|
improvement = significance['improvement']['relative_percentage']
|
|
|
|
learnings.append(
|
|
f"Testing {test['test_type']} can yield {abs(improvement):.1f}% conversion change"
|
|
)
|
|
|
|
if test['test_type'] == 'title':
|
|
learnings.append(
|
|
"Title changes affect search visibility and user perception"
|
|
)
|
|
elif test['test_type'] == 'screenshot':
|
|
learnings.append(
|
|
"First 2-3 screenshots are critical for conversion"
|
|
)
|
|
|
|
return learnings
|
|
|
|
|
|
def plan_ab_test(
    test_type: str,
    variant_a: Dict[str, Any],
    variant_b: Dict[str, Any],
    hypothesis: str,
    baseline_conversion: float
) -> Dict[str, Any]:
    """
    Convenience function to plan an A/B test.

    Args:
        test_type: Type of test
        variant_a: Control variant
        variant_b: Test variant
        hypothesis: Test hypothesis
        baseline_conversion: Current conversion rate

    Returns:
        Complete test plan
    """
    planner = ABTestPlanner()

    # Use the conservative per-type minimum detectable effect
    # (5% default for unknown test types) for the sample-size math.
    effect_size = planner.MIN_EFFECT_SIZES.get(test_type, 0.05)

    design = planner.design_test(test_type, variant_a, variant_b, hypothesis)
    requirements = planner.calculate_sample_size(baseline_conversion, effect_size)

    return {
        'test_design': design,
        'sample_size_requirements': requirements
    }