Bootstrap

docker安装Gorse

gorse官方文档

推荐算法文档 (想了解的朋友可以自行观看~)

Gorse是一个用Go语言编写的开源推荐系统。Gorse的目标是成为一个通用的开源推荐系统,可以很容易地被引入到各种各样的在线服务中。通过将物品、用户和交互数据导入到Gorse中,系统将自动训练模型,为每个用户生成推荐,项目特点如下。

  • 多来源:根据热门、最新、基于用户、基于项目和协同过滤推荐项目。
  • AutoML:在后台自动搜索最佳推荐模型。
  • 分布式预测:支持单节点训练后,在推荐阶段进行水平扩展。
  • RESTful API:为数据 CRUD 和推荐请求公开 RESTful API。
  • 在线评估:根据最近插入的反馈分析在线推荐的性能。
  • 仪表板:提供用于数据管理、系统监控和集群状态检查的 GUI。

#docker部署

用Docker Compose或Docker Swarm管理小规模微服务集群是不错的选择。Gorse官方提供了Docker镜像:

Docker镜像 | 版本 | 镜像大小 | 拉取次数
gorse-master
gorse-server
gorse-worker
gorse-in-one

Docker 上的 Gorse 集群

Gorse 工作节点和服务节点具有水平可扩展性,增加服务节点的数量可以提高在线推荐吞吐量,增加工作节点的数量可以提高离线推荐吞吐量。

这是一个带有一个主节点、一个服务器节点和一个工作节点的 Gorse 集群的示例:

  • 使用以下内容创建docker-compose.yaml

# Compose file format version.
version: "3.8"
services:
  # Redis instance; the master service points GORSE_CACHE_STORE at redis:6379.
  redis:
    image: redis
    restart: unless-stopped
    ports:
      # host 6380 -> container 6379. Quoted so the HOST:CONTAINER pair is
      # always read as a string, never as a YAML 1.1 base-60 number.
      - "6380:6379"

  # MySQL instance; the master service points GORSE_DATA_STORE at mysql:3306
  # using the MYSQL_USER / MYSQL_PASSWORD / MYSQL_DATABASE values below.
  mysql:
    image: mysql/mysql-server
    restart: unless-stopped
    ports:
      # host 3307 -> container 3306 (quoted: port mappings must be strings).
      - "3307:3306"
    environment:
      # Demo credentials only - change them before exposing this stack.
      # Passwords are quoted so YAML keeps them as strings, not integers.
      MYSQL_ROOT_PASSWORD: "123456"
      MYSQL_DATABASE: gorse
      MYSQL_USER: admin
      MYSQL_PASSWORD: "123456"
    volumes:
      - mysql-data:/var/lib/mysql

  # postgres:
  #   image: postgres:10.0
  #   ports:
  #     - 5432:5432
  #   environment:
  #     POSTGRES_DB: gorse
  #     POSTGRES_USER: gorse
  #     POSTGRES_PASSWORD: gorse_pass
  #   volumes:
  #     - postgres_data:/var/lib/postgresql/data

  # mongo:
  #   image: mongo:4.0
  #   ports:
  #     - 27017:27017
  #   environment:
  #     MONGO_INITDB_DATABASE: gorse
  #     MONGO_INITDB_ROOT_USERNAME: root
  #     MONGO_INITDB_ROOT_PASSWORD: password
  #   volumes:
  #     - mongo_data:/data/db

  # clickhouse:
  #   image: yandex/clickhouse-server:21.10
  #   ports:
  #     - 8123:8123
  #   environment:
  #     CLICKHOUSE_DB: gorse
  #     CLICKHOUSE_USER: gorse
  #     CLICKHOUSE_PASSWORD: gorse_pass
  #   volumes:
  #     - clickhouse_data:/var/lib/clickhouse

  # Gorse worker node; connects to the master at master:8086.
  worker:
    image: zhenghaoz/gorse-worker
    restart: unless-stopped
    ports:
      # Worker HTTP port (matches --http-port below); quoted as a string.
      - "8089:8089"
    # Folded scalar: the lines below are joined into one argument string.
    command: >
      --master-host master --master-port 8086
      --http-host 0.0.0.0 --http-port 8089
      --log-path /var/log/gorse/worker.log
      --cache-path /var/lib/gorse/worker_cache.data
    volumes:
      - gorse_log:/var/log/gorse
      - worker_data:/var/lib/gorse
    depends_on:
      - master

  # Gorse server node; connects to the master at master:8086.
  server:
    image: zhenghaoz/gorse-server
    restart: unless-stopped
    ports:
      # Server HTTP port (matches --http-port below); quoted as a string.
      - "8087:8087"
    # Folded scalar: the lines below are joined into one argument string.
    command: >
      --master-host master --master-port 8086
      --http-host 0.0.0.0 --http-port 8087
      --log-path /var/log/gorse/server.log
      --cache-path /var/lib/gorse/server_cache.data
    volumes:
      - gorse_log:/var/log/gorse
      - server_data:/var/lib/gorse
    depends_on:
      - master

  # Gorse master node; started with the mounted /etc/gorse/config.toml.
  master:
    image: zhenghaoz/gorse-master
    restart: unless-stopped
    ports:
      # Quoted so each HOST:CONTAINER pair is always parsed as a string.
      # 8086 is the port worker/server pass as --master-port.
      - "8086:8086"
      - "8088:8088"
    environment:
      # Storage endpoints use the compose service names (redis, mysql) as
      # hosts. Enable exactly one GORSE_DATA_STORE line, matching the database
      # service that is uncommented above.
      GORSE_CACHE_STORE: "redis://redis:6379"
      GORSE_DATA_STORE: "mysql://admin:123456@tcp(mysql:3306)/gorse"
      # GORSE_DATA_STORE: postgres://gorse:gorse_pass@postgres/gorse?sslmode=disable
      # GORSE_DATA_STORE: mongodb://root:password@mongo:27017/gorse?authSource=admin&connect=direct
      # GORSE_DATA_STORE: clickhouse://gorse:gorse_pass@clickhouse:8123/gorse
    # Folded scalar: the lines below are joined into one argument string.
    command: >
      -c /etc/gorse/config.toml
      --log-path /var/log/gorse/master.log
      --cache-path /var/lib/gorse/master_cache.data
    volumes:
      # The config file must exist at ./config/config.toml next to this file.
      - ./config/config.toml:/etc/gorse/config.toml
      - gorse_log:/var/log/gorse
      - master_data:/var/lib/gorse
    depends_on:
      - redis
      - mysql
      # - postgres
      # - mongo
      # - clickhouse

# Named volumes used by the services above; each uses the default settings.
volumes:
  # /var/lib/gorse of the worker, server and master containers.
  worker_data: {}
  server_data: {}
  master_data: {}
  # /var/log/gorse, shared by all three Gorse containers.
  gorse_log: {}
  # /var/lib/mysql of the mysql container.
  mysql-data: {}
  # Enable together with the matching commented-out database service.
  # postgres_data:
  # mongo_data:
  # clickhouse_data:
# Gorse configuration file (TOML).
# NOTE(review): presumably this is the ./config/config.toml that the compose
# `master` service mounts at /etc/gorse/config.toml - confirm.
[database]

# The database for caching, supports Redis, MySQL, Postgres and MongoDB:
#   redis://<user>:<password>@<host>:<port>/<db_number>
#   rediss://<user>:<password>@<host>:<port>/<db_number>
#   redis+cluster://<user>:<password>@<host1>:<port1>,<host2>:<port2>,...,<hostN>:<portN>
#   postgres://bob:secret@1.2.3.4:5432/mydb?sslmode=verify-full
#   postgresql://bob:secret@1.2.3.4:5432/mydb?sslmode=verify-full
#   mongodb://[username:password@]host1[:port1][,...hostN[:portN]][/[defaultauthdb][?options]]
#   mongodb+srv://[username:password@]host1[:port1][,...hostN[:portN]][/[defaultauthdb][?options]]
# NOTE(review): the compose file sets GORSE_CACHE_STORE on the master;
# presumably that environment variable overrides this localhost value - confirm.
cache_store = "redis://localhost:6379/0"

# The database for persistent data, supports MySQL, Postgres, ClickHouse and MongoDB:
#   mysql://[username[:password]@][protocol[(address)]]/dbname[?param1=value1&...&paramN=valueN]
#   postgres://bob:secret@1.2.3.4:5432/mydb?sslmode=verify-full
#   postgresql://bob:secret@1.2.3.4:5432/mydb?sslmode=verify-full
#   clickhouse://user:password@host[:port]/database?param1=value1&...&paramN=valueN
#   chhttp://user:password@host[:port]/database?param1=value1&...&paramN=valueN
#   chhttps://user:password@host[:port]/database?param1=value1&...&paramN=valueN
#   mongodb://[username:password@]host1[:port1][,...hostN[:portN]][/[defaultauthdb][?options]]
#   mongodb+srv://[username:password@]host1[:port1][,...hostN[:portN]][/[defaultauthdb][?options]]
# NOTE(review): these credentials (root / gorse_pass) differ from the ones the
# compose file passes via GORSE_DATA_STORE (admin / 123456) - confirm which applies.
data_store = "mysql://root:gorse_pass@tcp(localhost:3306)/gorse"

# The naming prefix for tables (collections, keys) in databases. The default value is empty.
table_prefix = ""

# The naming prefix for tables (collections, keys) in cache storage databases. The default value is `table_prefix`.
cache_table_prefix = ""

# The naming prefix for tables (collections, keys) in data storage databases. The default value is `table_prefix`.
data_table_prefix = ""

# Settings for the master node.
[master]

# gRPC port of the master node. The default value is 8086.
port = 8086

# gRPC host of the master node. The default value is "0.0.0.0".
host = "0.0.0.0"

# HTTP port of the master node. The default value is 8088.
http_port = 8088

# HTTP host of the master node. The default value is "0.0.0.0".
http_host = "0.0.0.0"

# AllowedDomains is a list of allowed values for the HTTP Origin header.
# The list may contain the special wildcard string ".*", which allows all origins.
# If empty, all origins are allowed.
http_cors_domains = []

# AllowedMethods is either empty or a list of HTTP method names. Checking is case-insensitive.
http_cors_methods = []

# Number of working jobs in the master node. The default value is 1.
n_jobs = 1

# Meta information timeout. The default value is 10s.
meta_timeout = "10s"

# Username for the master node dashboard.
dashboard_user_name = "admin"

# Password for the master node dashboard.
# Demo value only - change it before exposing the dashboard.
dashboard_password = "123456"

# Secret key for admin APIs (SSL required). Left empty here.
admin_api_key = ""

# Settings for server nodes.
[server]

# Default number of returned items. The default value is 10.
default_n = 10

# Secret key for RESTful APIs (SSL required). Left empty here.
api_key = ""

# Clock error in the cluster. The default value is 5s.
clock_error = "5s"

# Insert new users while inserting feedback. The default value is true.
auto_insert_user = true

# Insert new items while inserting feedback. The default value is true.
auto_insert_item = true

# Server-side cache expire time. The default value is 10s.
cache_expire = "10s"

# General recommendation cache settings.
[recommend]

# The cache size for recommended/popular/latest items. The default value is 10.
# Set to 100 here.
cache_size = 100

# Recommended cache expire time. The default value is 72h.
cache_expire = "72h"

# How feedback and items are interpreted as training data.
[recommend.data_source]

# The feedback types for positive events.
# The names must match the FeedbackType values your application inserts.
positive_feedback_types = ["collect","cart","alipay"]

# The feedback types for read events.
read_feedback_types = ["click"]

# The time-to-live (days) of positive feedback, 0 means disabled. The default value is 0.
positive_feedback_ttl = 0

# The time-to-live (days) of items, 0 means disabled. The default value is 0.
item_ttl = 0

# Popular-items settings.
[recommend.popular]

# The time window of popular items. The default value is 4320h.
# Set to 720h here.
popular_window = "720h"

# User neighbor (similar users) settings.
[recommend.user_neighbors]

# The type of neighbors for users. There are three types:
#   similar: Neighbors are found by number of common labels.
#   related: Neighbors are found by number of common liked items.
#   auto: If a user has labels, neighbors are found by number of common labels.
#         If the user has no labels, neighbors are found by number of common liked items.
# The default value is "auto". Set to "similar" here.
neighbor_type = "similar"

# Enable approximate user neighbor searching using vector index. The default value is true.
enable_index = true

# Minimal recall for approximate user neighbor searching. The default value is 0.8.
index_recall = 0.8

# Maximal number of fit epochs for approximate user neighbor searching vector index. The default value is 3.
index_fit_epoch = 3

# Item neighbor (similar items) settings.
[recommend.item_neighbors]

# The type of neighbors for items. There are three types:
#   similar: Neighbors are found by number of common labels.
#   related: Neighbors are found by number of common users.
#   auto: If an item has labels, neighbors are found by number of common labels.
#         If the item has no labels, neighbors are found by number of common users.
# The default value is "auto". Set to "similar" here.
neighbor_type = "similar"

# Enable approximate item neighbor searching using vector index. The default value is true.
enable_index = true

# Minimal recall for approximate item neighbor searching. The default value is 0.8.
index_recall = 0.8

# Maximal number of fit epochs for approximate item neighbor searching vector index. The default value is 3.
index_fit_epoch = 3

# Collaborative filtering settings.
[recommend.collaborative]

# Enable approximate collaborative filtering recommendation using vector index. The default value is true.
enable_index = true

# Minimal recall for approximate collaborative filtering recommendation. The default value is 0.9.
index_recall = 0.9

# Maximal number of fit epochs for the approximate collaborative filtering vector index. The default value is 3.
index_fit_epoch = 3

# The time period for model fitting. The default value is "60m".
model_fit_period = "60m"

# The time period for model searching. The default value is "360m".
model_search_period = "360m"

# The number of epochs for model searching. The default value is 100.
model_search_epoch = 100

# The number of trials for model searching. The default value is 10.
model_search_trials = 10

# Enable searching models of different sizes, which consumes more memory. The default value is false.
enable_model_size_search = false

# Settings for putting already-seen items back into recommendations.
[recommend.replacement]

# Replace historical items back to recommendations. The default value is false.
enable_replacement = false

# Decay the weights of replaced items from positive feedback. The default value is 0.8.
positive_replacement_decay = 0.8

# Decay the weights of replaced items from read feedback. The default value is 0.6.
read_replacement_decay = 0.6

# Offline recommendation settings.
[recommend.offline]

# The time period to check recommendation for users. The default value is 1m.
check_recommend_period = "1m"

# The time period to refresh recommendation for inactive users. The default value is 120h.
# Set to 24h here.
refresh_recommend_period = "24h"

# Enable latest recommendation during offline recommendation. The default value is false.
# Enabled here.
enable_latest_recommend = true

# Enable popular recommendation during offline recommendation. The default value is false.
enable_popular_recommend = false

# Enable user-based similarity recommendation during offline recommendation. The default value is false.
# Enabled here.
enable_user_based_recommend = true

# Enable item-based similarity recommendation during offline recommendation. The default value is false.
enable_item_based_recommend = false

# Enable collaborative filtering recommendation during offline recommendation. The default value is true.
enable_collaborative_recommend = true

# Enable click-through rate prediction during offline recommendation. Otherwise, results from multi-way recommendation
# would be merged randomly. The default value is false. Enabled here.
enable_click_through_prediction = true

# The explore recommendation method is used to inject popular items or latest items into recommended result:
#   popular: Recommend popular items to cold-start users.
#   latest: Recommend latest items to cold-start users.
# The default value is { popular = 0.0, latest = 0.0 }.
# Set to { popular = 0.1, latest = 0.2 } here.
explore_recommend = { popular = 0.1, latest = 0.2 }

# Online recommendation settings.
[recommend.online]

# The fallback recommendation method is used when cached recommendations are drained:
#   item_based: Recommend similar items to cold-start users.
#   popular: Recommend popular items to cold-start users.
#   latest: Recommend latest items to cold-start users.
# Recommenders are used in order. The default value is ["latest"].
# Here "item_based" is tried first, then "latest".
fallback_recommend = ["item_based", "latest"]

# The number of feedback used in fallback item-based similar recommendation. The default value is 10.
num_feedback_fallback_item_based = 10

# Tracing settings for REST APIs.
[tracing]

# Enable tracing for REST APIs. The default value is false.
enable_tracing = false

# The type of tracing exporter; one of "jaeger", "zipkin", "otlp" and "otlphttp". The default value is "jaeger".
exporter = "jaeger"

# The endpoint of the tracing collector.
collector_endpoint = "http://localhost:14268/api/traces"

# The type of tracing sampler; one of "always", "never" and "ratio". The default value is "always".
sampler = "always"

# The ratio of the ratio-based sampler. The default value is 1.
ratio = 1

以下是对这个配置文件内容的解释:

一、[database] 部分
  1. cache_store:指定用于缓存的数据库连接字符串,可以是 Redis 等。这里设置为连接本地的 Redis,端口为 6379,数据库编号为 0。

  2. data_store:指定用于持久化数据的数据库连接字符串,可以是 MySQL、Postgres、ClickHouse 或 MongoDB 等。这里设置为连接本地的 MySQL,用户为 root,密码为 gorse_pass,数据库名为 gorse。

  3. table_prefix、cache_table_prefix、data_table_prefix:分别为数据库、缓存存储数据库、数据存储数据库中表(集合、键)的命名前缀。table_prefix 默认值为空,后两者默认取 table_prefix 的值。

二、[master] 部分
  1. port:主节点的 gRPC 端口,默认值为 8086。

  2. host:主节点的 gRPC 主机地址,默认值为 “0.0.0.0”。

  3. http_port:主节点的 HTTP 端口,默认值为 8088。

  4. http_host:主节点的 HTTP 主机地址,默认值为 “0.0.0.0”。

  5. http_cors_domains、http_cors_methods:分别为允许的 HTTP 跨域请求的域名列表和方法列表,默认为空表示允许所有。

  6. n_jobs:主节点中的工作任务数量,默认值为 1。

  7. meta_timeout:元信息超时时间,默认值为 10 秒。

  8. dashboard_user_name、dashboard_password:主节点仪表盘的用户名和密码。

  9. admin_api_key:管理 API 的密钥(需要启用 SSL)。

三、[server] 部分
  1. default_n:默认返回的物品数量,默认值为 10。

  2. api_key:RESTful API 的密钥(需要启用 SSL)。

  3. clock_error:集群中的时钟误差,默认值为 5 秒。

  4. auto_insert_user、auto_insert_item:在插入反馈时是否自动插入新用户和新物品,默认值为 true。

  5. cache_expire:服务器端缓存过期时间,默认值为 10 秒。

四、[recommend] 部分
  1. cache_size:推荐 / 热门 / 最新物品的缓存大小,默认值为 100。

  2. cache_expire:推荐缓存过期时间,默认值为 72 小时。

五、[recommend.data_source] 部分
  1. positive_feedback_types、read_feedback_types:分别为正反馈和读取事件的反馈类型列表。

  2. positive_feedback_ttl、item_ttl:正反馈和物品的生存时间(以天为单位),0 表示禁用,默认值为 0。

六、[recommend.popular] 部分
  1. popular_window:热门物品的时间窗口,默认值为 4320 小时(180 天)。

七、[recommend.user_neighbors] 和 [recommend.item_neighbors] 部分
  1. neighbor_type:用户或物品邻居的类型,可以是 “similar”(基于共同标签)、“related”(基于共同喜欢的物品或用户)或 “auto”(根据是否有标签自动选择),默认值为 “auto”。

  2. enable_index:是否启用近似邻居搜索的向量索引,默认值为 true。

  3. index_recall:近似邻居搜索的最小召回率,默认值为 0.8。

  4. index_fit_epoch:近似邻居搜索向量索引的最大拟合轮数,默认值为 3。

八、[recommend.collaborative] 部分
  1. enable_index:是否启用近似协同过滤推荐的向量索引,默认值为 true。

  2. index_recall:近似协同过滤推荐的最小召回率,默认值为 0.9。

  3. index_fit_epoch:近似协同过滤推荐向量索引的最大拟合轮数,默认值为 3。

  4. model_fit_period:模型拟合的时间周期,默认值为 “60m”(60 分钟)。

  5. model_search_period:模型搜索的时间周期,默认值为 “360m”(6 小时)。

  6. model_search_epoch:模型搜索的轮数,默认值为 100。

  7. model_search_trials:模型搜索的试验次数,默认值为 10。

  8. enable_model_size_search:是否启用搜索不同大小的模型,默认值为 false。

九、[recommend.replacement] 部分
  1. enable_replacement:是否启用将历史物品替换回推荐中,默认值为 false。

  2. positive_replacement_decay、read_replacement_decay:分别为正反馈和读取反馈中被替换物品的权重衰减,默认值分别为 0.8 和 0.6。

十、[recommend.offline] 部分
  1. check_recommend_period:检查用户推荐的时间周期,默认值为 “1m”(1 分钟)。

  2. refresh_recommend_period:刷新不活跃用户推荐的时间周期,默认值为 “120h”(120 小时),此处设置为 “24h”(24 小时)。

  3. enable_latest_recommend、enable_popular_recommend、enable_user_based_recommend、enable_item_based_recommend、enable_collaborative_recommend:分别为在离线推荐中是否启用最新推荐、热门推荐、基于用户的相似推荐、基于物品的相似推荐和协同过滤推荐。默认仅协同过滤推荐为 true,其余均为 false。

  4. enable_click_through_prediction:在离线推荐中是否启用点击率预测,默认值为 false。

  5. explore_recommend:探索推荐方法,用于按比例向推荐结果中注入热门或最新物品,默认值为 { popular = 0.0, latest = 0.0 }(即不注入),此处设置为 { popular = 0.1, latest = 0.2 }。

十一、[recommend.online] 部分
  1. fallback_recommend:当缓存的推荐耗尽时的回退推荐方法,可以是基于物品的相似推荐、热门推荐或最新推荐,默认值为 ["latest"]。

  2. num_feedback_fallback_item_based:在基于物品的相似回退推荐中使用的反馈数量,默认值为 10。

十二、[tracing] 部分
  1. enable_tracing:是否启用对 REST API 的跟踪,默认值为 false。

  2. exporter:跟踪导出器的类型,可以是 “jaeger”、“zipkin”、“otlp” 或 “otlphttp”,默认值为 “jaeger”。

  3. collector_endpoint:跟踪收集器的端点。

  4. sampler:跟踪采样器的类型,可以是 “always”、“never” 或 “ratio”,默认值为 “always”。

  5. ratio:基于比例的采样器的比例,默认值为 1。

启动所有服务:

docker-compose up -d

命令行参数

主节点的命令行参数:
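| 参数 | 默认值 | 描述 |
| --- | --- | --- |
| --cache-path | worker_cache.data | 缓存文件路径 |
| -c, --config | | 配置文件路径 |
| --debug | | 开启DEBUG日志模式 |
| -h, --help | | 显示帮助信息 |
| --log-path | | 日志文件路径 |
| --log-max-size | | 日志文件的最大兆字节数 |
| --log-max-age | | 保留旧日志文件的最大天数 |
| --log-max-backups | | 保留的旧日志文件的最大数量 |
| -v, --version | | 显示版本信息 |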
服务节点的命令行参数:
| 参数 | 默认值 | 描述 |
| --- | --- | --- |
| --cache-path | worker_cache.data | 缓存文件路径 |
| --debug | | 开启DEBUG日志模式 |
| -h, --help | | 显示帮助信息 |
| --http-host | 127.0.0.1 | RESTful API 和 Prometheus 度量所在IP地址 |
| --http-port | 8087 | RESTful API 和 Prometheus 度量所在端口 |
| --log-path | | 日志文件路径 |
| --log-max-size | | 日志文件的最大兆字节数 |
| --log-max-age | | 保留旧日志文件的最大天数 |
| --log-max-backups | | 保留的旧日志文件的最大数量 |
| --master-host | 127.0.0.1 | 主节点的IP地址 |
| --master-port | 8086 | 主节点的端口 |
| -v, --version | | 显示版本信息 |
工作节点的命令行参数:
| 参数 | 默认值 | 描述 |
| --- | --- | --- |
| --cache-path | worker_cache.data | 缓存文件路径 |
| --debug | | 开启DEBUG日志模式 |
| -h, --help | | 显示帮助信息 |
| --http-host | 127.0.0.1 | Prometheus metrics HTTP接口IP地址 |
| --http-port | 8089 | Prometheus metrics HTTP接口端口 |
| -j, --jobs | 1 | 工作线程数量 |
| --log-path | | 日志文件路径 |
| --log-max-size | | 日志文件的最大兆字节数 |
| --log-max-age | | 保留旧日志文件的最大天数 |
| --log-max-backups | | 保留的旧日志文件的最大数量 |
| --master-host | 127.0.0.1 | 主节点的IP地址 |
| --master-port | 8086 | 主节点的端口 |
| -v, --version | | 显示版本信息 |

#快速开始(测试版本)

Playground模式是为初学者准备的。只需通过以下命令为GitHub仓库设置一个推荐系统。

Docker

docker run -p 8088:8088 zhenghaoz/gorse-in-one --playground

Playground模式将从GitRec下载数据并导入到Gorse中。仪表板可以通过http://localhost:8088访问。

在“任务”页面上完成“查找临近的物品”任务后,尝试向Gorse插入一些反馈。假设Bob是一名前端开发人员,他在GitHub上star了几个前端仓库。我们把他的star行为作为反馈写入Gorse。

import "github.com/zhenghaoz/gorse/client"
func main(){

gorse := client.NewGorseClient("http://127.0.0.1:8088", "")
gorse.InsertFeedback([]client.Feedback{
{FeedbackType: "star", UserId: "bob", ItemId: "vuejs:vue", Timestamp: "2022-02-24"},
{FeedbackType: "star", UserId: "bob", ItemId: "d3:d3", Timestamp: "2022-02-25"},
{FeedbackType: "star", UserId: "bob", ItemId: "dogfalo:materialize", Timestamp: "2022-02-26"},
{FeedbackType: "star", UserId: "bob", ItemId: "mozilla:pdf.js", Timestamp: "2022-02-27"},
{FeedbackType: "star", UserId: "bob", ItemId: "moment:moment", Timestamp: "2022-02-28"},})

}

然后从Gorse中获取10个推荐的物品。我们可以发现,前端相关的仓库被推荐给了Bob。

gorse.GetRecommend("bob", "", 10)
["mbostock:d3","nt1m:material-framework","mdbootstrap:vue-bootstrap-with-material-design","justice47:f2-vue","10clouds:cyclejs-cookie","academicpages:academicpages.github.io","accenture:alexia","addyosmani:tmi","1wheel:d3-starterkit","acdlite:redux-promise"]

最终的输出可能与示例不同,因为playground数据集会随时间而变化。

悦读

道可道,非常道;名可名,非常名。 无名,天地之始,有名,万物之母。 故常无欲,以观其妙,常有欲,以观其徼。 此两者,同出而异名,同谓之玄,玄之又玄,众妙之门。

;