前几天写了一篇关于哈希算法的文章,起源就是在构思AB实验平台的时候,用到了哈希,所以对其做了深入的了解
AB实验平台是一般互联网做策略、样式实验会用到的一个系统,一般开启某个实验之后,需要对线上流量进行分流:客户端->实验平台->策略平台->应用服务,大概是这个链路
场景
线上流量策略分流,需要对不同人群做策略实验,最终效果好的一组需要推全到100%,在未推全之前,100%的流量会被分到原有流量+几个实验组;
比如我有一个实验:样式迭代,产品希望60%的流量走线上逻辑,剩下的40%,均分到4个实验组,这样可以细致的比对样式的效果,当然,如果一个用户命中了实验组A,那么他后续的行为,在实验组不调整的情况下,需要一直保持(有的人说希望调整了也保持,但是真的了解AB分流逻辑之后,真挺难的);
其实我们有自己的AB平台,但是我在想如果让我去实现,我会怎么做,便有了这一篇文章;
考虑的问题
1.如何保证一个用户对实验命中唯一性
在实验组不调整的情况下,PM希望用户分流的结果始终保持一致;
【策略】:hash算法,通过对imei进行哈希运算得到int的哈希值即可,可以加一个时间戳(实验修改时间),来控制用户的imei命中变化
2.如何按比例分流
【策略】取模运算,构建一个长度为100的bucket,然后通过对imei的哈希值运算结果对100取模,即可得到0-100的数,从而去命中数据区间,挑选实验组即可;由于哈希算法的特性,只要imei不变,时间戳(因子)不变,计算得到的哈希值是永远不会变的,所以取模结果,命中实验的结果也是不变的;
比如我的case是线上流量60%,其他实验组10%,10%,10%,10%,则可以构建一个数据区间:【0,60,70,80,90,100】
实战
代码
@Data
public class TrafficDistributor {
private String id;// 实验id
private String name;
private double isolation;// 线上流量
private List<Experiment> experiments;// 实验组
private Table<String, String, Set<String>> whiteTable = HashBasedTable.create();
private static Date updateTime = new Date(1719844946L);//2024/7/1 22:42:26
public TrafficDistributor(String id, String name, double isolation, List<Experiment> experiments, Table<String, String, Set<String>> whiteTable) {
this.id = id;
this.name = name;
this.isolation = isolation;
this.experiments = experiments;
this.whiteTable = whiteTable;
}
public static void main(String[] args) {
// 实验id
String id = "new_style";
// 1.初始化白名单==》正式环境从db中拿出来初始化
Table<String, String, Set<String>> white = HashBasedTable.create();
white.put(id, "00", Sets.newHashSetWithExpectedSize(0));
white.put(id, "01", Set.of("whitelist_imei_1", "whitelist_imei_2"));
white.put(id, "02", Set.of("whitelist_imei_3", "whitelist_imei_4"));
white.put(id, "11", Set.of("whitelist_imei_5", "whitelist_imei_6"));
white.put(id, "12", Set.of("whitelist_imei_7", "whitelist_imei_8"));
// 2.初始化实验组
Experiment experiment1 = new Experiment("01", "对照组01", 10);
Experiment experiment2 = new Experiment("02", "对照组02", 10);
Experiment experiment3 = new Experiment("11", "实验组01", 10);
Experiment experiment4 = new Experiment("12", "实验组02", 10);
List<Experiment> experiments = List.of(experiment1, experiment2, experiment3, experiment4);
// 3.初始化分流器
TrafficDistributor distributor = new TrafficDistributor(id, "新样式实验", 60, experiments, white);
// 4.模拟分流
for (int times = 0; times < 100; times++) {
int index = 1, size = 100;
Map<String, Integer> result = Maps.newHashMap();
List<Long> cost = Lists.newArrayListWithCapacity(size);
while (index <= size) {
String imei = RandomStringUtils.randomAlphanumeric(64);
long start = System.currentTimeMillis();
// 核心分流
Experiment experiment = distributor.allocate(id, imei);
if (experiment != null) {
// 实验组
result.compute(experiment.getId(), (eid, count) -> (count == null) ? 1 : count + 1);
} else {
// 线上
int value = result.getOrDefault("00", 0);
result.put("00", value + 1);
}
cost.add(System.currentTimeMillis() - start);
index++;
}
System.out.println("耗时:" + cost.stream().mapToLong(Long::longValue).sum() + ";结果:" + JSON.toJSONString(result));
}
}
/**
* 流量分配: 白名单 - 实验组 - 线上流量
*/
public Experiment allocate(String id, String imei) {
Experiment whiteListShot = whiteListShot(id, imei);
if (whiteListShot != null) {
return whiteListShot;
}
List<Double> weights = experiments.stream().map(Experiment::getWeight).collect(Collectors.toList());
int groupId = distributeTraffic(weights, imei);
if (groupId < 0) {
// 线上流量
return null;
}
// 实验组流量
return experiments.get(groupId);
}
/**
* 根据每个实验组的权重配比,判断最终流量应该分配到哪个实验组。
*
* @param weights 每个实验组的权重值数组。
* @param imei imei
* @return 分配流量的实验组索引
*/
public int distributeTraffic(List<Double> weights, String imei) {
if (CollectionUtils.isEmpty(weights)) {
return -1;
}
double totalWeight = 100;// 总权重100%
double testWeight = weights.stream().mapToDouble(Double::doubleValue).sum();// 实验组总权重
if (totalWeight != (isolation + testWeight)) {
throw new IllegalArgumentException("实验组和对照组流量分配存在问题");
}
// imei+时间戳(因子)生成hash
int hash = HashUtils.hashcode(imei + updateTime.getTime());
return bucketShot(weights, hash % totalWeight);
}
/**
* 哈希值进行取模定位后,命中的实验
*/
public int bucketShot(List<Double> weights, double bucketIndex) {
List<Double> range = Lists.newArrayListWithCapacity(weights.size() + 1);
range.add(isolation);
double pre = range.get(0);
for (double weight : weights) {
range.add(pre + weight);
pre = pre + weight;
}
for (int i = 0; i < range.size(); i++) {
if (bucketIndex <= range.get(i)) {
return i - 1;
}
}
throw new IllegalArgumentException("实验组和对照组流量分配存在问题");
}
/**
* 白名单命中
*
* @param id 实验id
* @param imei imei
* @return 实验组
*/
public Experiment whiteListShot(String id, String imei) {
assert whiteTable != null && !whiteTable.isEmpty();
if (!whiteTable.containsRow(id)) {
throw new IllegalArgumentException("实验id=" + id + "不存在");
}
Map<String, Set<String>> experimentData = whiteTable.row(id);
for (Map.Entry<String, Set<String>> entry : experimentData.entrySet()) {
Set<String> values = entry.getValue();
if (!CollectionUtils.isEmpty(values) && values.contains(imei)) {
String key = entry.getKey();
return experiments.stream().filter(test -> test.getId().equals(key)).findFirst().orElse(null);
}
}
return null;
}
// 其他方法保持不变
@Data
@AllArgsConstructor
static class Experiment {
private String id;
private String name;
private double weight;
}
public class HashUtils {
/**
* hashCode方法
*/
public static int hashcode(Object obj) {
final int p = 16777619;
int hash = (int) 2166136261L;
String str = obj.toString();
for (int i = 0; i < str.length(); i++)
hash = (hash ^ str.charAt(i)) * p;
hash += hash << 13;
hash ^= hash >> 7;
hash += hash << 3;
hash ^= hash >> 17;
hash += hash << 5;
if (hash < 0)
hash = Math.abs(hash);
return hash;
}
}
分析
耗时:34;结果:{"11":11,"00":61,"01":7,"12":9,"02":12}
耗时:4;结果:{"11":8,"00":61,"12":4,"01":9,"02":18}
耗时:2;结果:{"11":9,"00":60,"01":10,"12":7,"02":14}
耗时:2;结果:{"11":11,"00":54,"01":10,"12":14,"02":11}
耗时:1;结果:{"11":9,"00":65,"12":12,"01":8,"02":6}
耗时:3;结果:{"11":9,"00":61,"01":6,"12":12,"02":12}
耗时:2;结果:{"11":9,"00":56,"12":8,"01":17,"02":10}
耗时:2;结果:{"11":7,"00":61,"12":7,"01":10,"02":15}
耗时:1;结果:{"11":8,"00":58,"01":13,"12":4,"02":17}
耗时:0;结果:{"11":5,"00":72,"01":9,"12":5,"02":9}
耗时:0;结果:{"11":2,"00":70,"12":14,"01":8,"02":6}
耗时:2;结果:{"11":7,"00":63,"12":10,"01":10,"02":10}
耗时:0;结果:{"11":8,"00":60,"12":11,"01":10,"02":11}
耗时:1;结果:{"11":9,"00":60,"12":11,"01":13,"02":7}
耗时:0;结果:{"11":12,"00":71,"12":5,"01":9,"02":3}
耗时:1;结果:{"11":12,"00":57,"01":11,"12":11,"02":9}
耗时:0;结果:{"11":8,"00":62,"01":14,"12":7,"02":9}
耗时:1;结果:{"11":10,"00":64,"12":6,"01":9,"02":11}
耗时:2;结果:{"11":5,"00":73,"12":7,"01":9,"02":6}
耗时:0;结果:{"11":9,"00":68,"01":6,"12":9,"02":8}
耗时:1;结果:{"11":12,"00":63,"01":10,"12":4,"02":11}
耗时:1;结果:{"11":15,"00":59,"01":8,"12":9,"02":9}
耗时:0;结果:{"11":10,"00":66,"01":8,"12":6,"02":10}
耗时:1;结果:{"11":8,"00":64,"01":10,"12":6,"02":12}
耗时:2;结果:{"11":11,"00":63,"01":8,"12":8,"02":10}
耗时:1;结果:{"11":5,"00":66,"12":9,"01":12,"02":8}
耗时:3;结果:{"11":8,"00":67,"12":10,"01":9,"02":6}
耗时:1;结果:{"11":9,"00":54,"12":16,"01":7,"02":14}
耗时:0;结果:{"11":11,"00":63,"12":5,"01":11,"02":10}
耗时:1;结果:{"11":10,"00":59,"01":12,"12":8,"02":11}
耗时:1;结果:{"11":12,"00":62,"12":11,"01":8,"02":7}
耗时:0;结果:{"11":8,"00":59,"12":8,"01":13,"02":12}
耗时:1;结果:{"11":12,"00":51,"12":11,"01":15,"02":11}
耗时:1;结果:{"11":1,"00":72,"01":10,"12":8,"02":9}
耗时:1;结果:{"11":8,"00":55,"01":18,"12":9,"02":10}
耗时:0;结果:{"11":6,"00":54,"01":22,"12":5,"02":13}
耗时:1;结果:{"11":9,"00":58,"12":11,"01":11,"02":11}
耗时:1;结果:{"11":15,"00":57,"12":12,"01":3,"02":13}
耗时:1;结果:{"11":6,"00":66,"12":10,"01":11,"02":7}
耗时:0;结果:{"11":13,"00":61,"12":12,"01":8,"02":6}
耗时:0;结果:{"11":8,"00":61,"12":11,"01":10,"02":10}
耗时:0;结果:{"11":10,"00":57,"01":10,"12":12,"02":11}
耗时:1;结果:{"11":6,"00":62,"12":12,"01":11,"02":9}
耗时:5;结果:{"11":9,"00":64,"12":8,"01":10,"02":9}
耗时:0;结果:{"11":15,"00":54,"12":8,"01":9,"02":14}
耗时:0;结果:{"11":12,"00":62,"01":8,"12":10,"02":8}
耗时:0;结果:{"11":9,"00":63,"12":6,"01":12,"02":10}
耗时:0;结果:{"11":9,"00":59,"01":10,"12":9,"02":13}
耗时:1;结果:{"11":10,"00":58,"01":7,"12":14,"02":11}
耗时:1;结果:{"11":9,"00":73,"01":8,"12":2,"02":8}
耗时:0;结果:{"11":14,"00":62,"12":7,"01":10,"02":7}
耗时:1;结果:{"11":12,"00":55,"12":10,"01":12,"02":11}
耗时:0;结果:{"11":5,"00":59,"01":7,"12":17,"02":12}
耗时:0;结果:{"11":10,"00":59,"12":7,"01":10,"02":14}
耗时:1;结果:{"11":11,"00":54,"01":17,"12":8,"02":10}
耗时:0;结果:{"11":8,"00":65,"12":8,"01":9,"02":10}
耗时:1;结果:{"11":13,"00":61,"12":8,"01":9,"02":9}
耗时:1;结果:{"11":6,"00":67,"01":10,"12":11,"02":6}
耗时:1;结果:{"11":7,"00":61,"12":8,"01":10,"02":14}
耗时:0;结果:{"11":6,"00":63,"12":8,"01":10,"02":13}
耗时:0;结果:{"11":9,"00":62,"12":9,"01":9,"02":11}
耗时:1;结果:{"11":5,"00":65,"01":8,"12":11,"02":11}
耗时:0;结果:{"11":11,"00":52,"12":9,"01":15,"02":13}
耗时:0;结果:{"11":14,"00":66,"01":4,"12":7,"02":9}
耗时:0;结果:{"11":12,"00":54,"12":8,"01":8,"02":18}
耗时:1;结果:{"11":9,"00":64,"12":8,"01":10,"02":9}
耗时:1;结果:{"11":9,"00":65,"01":3,"12":11,"02":12}
耗时:0;结果:{"11":5,"00":67,"01":7,"12":12,"02":9}
耗时:0;结果:{"11":13,"00":50,"01":12,"12":11,"02":14}
耗时:1;结果:{"11":18,"00":55,"12":7,"01":10,"02":10}
耗时:0;结果:{"11":5,"00":64,"01":12,"12":5,"02":14}
耗时:0;结果:{"11":10,"00":68,"12":6,"01":7,"02":9}
耗时:1;结果:{"11":9,"00":71,"12":4,"01":6,"02":10}
耗时:0;结果:{"11":8,"00":62,"12":9,"01":9,"02":12}
耗时:0;结果:{"11":10,"00":64,"12":8,"01":9,"02":9}
耗时:0;结果:{"11":9,"00":57,"12":10,"01":9,"02":15}
耗时:0;结果:{"11":10,"00":60,"12":13,"01":9,"02":8}
耗时:0;结果:{"11":12,"00":66,"01":5,"12":9,"02":8}
耗时:0;结果:{"11":6,"00":58,"01":11,"12":13,"02":12}
耗时:1;结果:{"11":10,"00":62,"01":12,"12":9,"02":7}
耗时:1;结果:{"11":7,"00":66,"12":11,"01":7,"02":9}
耗时:0;结果:{"11":10,"00":63,"12":9,"01":11,"02":7}
耗时:1;结果:{"11":8,"00":61,"12":10,"01":12,"02":9}
耗时:0;结果:{"11":8,"00":62,"12":6,"01":10,"02":14}
耗时:0;结果:{"11":7,"00":68,"12":8,"01":11,"02":6}
耗时:0;结果:{"11":11,"00":54,"01":11,"12":16,"02":8}
耗时:0;结果:{"11":7,"00":68,"01":10,"12":6,"02":9}
耗时:0;结果:{"11":7,"00":65,"12":7,"01":8,"02":13}
耗时:0;结果:{"11":8,"00":69,"01":8,"12":5,"02":10}
耗时:0;结果:{"11":15,"00":60,"01":6,"12":11,"02":8}
耗时:0;结果:{"11":9,"00":70,"01":6,"12":7,"02":8}
耗时:0;结果:{"11":14,"00":62,"12":7,"01":10,"02":7}
耗时:0;结果:{"11":11,"00":64,"12":7,"01":7,"02":11}
耗时:1;结果:{"11":6,"00":56,"12":14,"01":10,"02":14}
耗时:0;结果:{"11":7,"00":64,"12":8,"01":11,"02":10}
耗时:1;结果:{"11":14,"00":65,"12":4,"01":10,"02":7}
耗时:0;结果:{"11":13,"00":59,"12":7,"01":13,"02":8}
耗时:1;结果:{"11":5,"00":65,"12":8,"01":14,"02":8}
耗时:1;结果:{"11":12,"00":54,"01":11,"12":10,"02":13}
耗时:1;结果:{"11":10,"00":56,"12":11,"01":11,"02":12}
Process finished with exit code 0
- 我手写的hashCode方法,包括随机字符串生成的imei仍然存在实验组分类偏向的情况,中间尝试过使用一致性哈希解决偏移,但是又解决不了加权按比例分配的问题,不知道各位同仁有没有什么好的建议;我怀疑是随机字符串的偏向性问题,后面看看有没有办法解决
- 取模算法是个宝藏,既可以精准定位,也可以利用随机性,来做区间命中
- updateTime它是个变化因子,如果实验有新增实验组或者流量调整,可以利用它来控制imei实验组变化
参考资料
- https://zhuanlan.zhihu.com/p/404232432
- https://blog.csdn.net/SmartCodeTech/article/details/113698568?spm=1001.2101.3001.6650.2&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ECtr-2-113698568-blog-131205090.235%5Ev43%5Epc_blog_bottom_relevance_base8&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ECtr-2-113698568-blog-131205090.235%5Ev43%5Epc_blog_bottom_relevance_base8&utm_relevant_index=5