3scale batcher policy #685

Merged: 23 commits, May 9, 2018

Commits
addb1a1 proxy: expose credentials in the context (davidor, Apr 24, 2018)
a7f580e Extract Errors module (davidor, Apr 24, 2018)
3d1ea79 conf/nginx.conf.liquid: add shdicts for cached auths and batched reports (davidor, Apr 24, 2018)
94ac269 policy/apicast: introduce flags to skip access() and post_action() (davidor, Apr 24, 2018)
6b4e0f7 policy/3scale_batcher: add helper to generate storage keys (davidor, Apr 26, 2018)
efb7484 spec/policy/3scale_batcher: add specs for KeysHelper (davidor, Apr 26, 2018)
eac9265 policy/3scale_batcher: add a cache for the authorizations (davidor, Apr 26, 2018)
98f2ea0 spec/policy/3scale_batcher: add specs for the auths cache (davidor, Apr 26, 2018)
14b2adf policy/3scale_batcher: add a reports batcher (davidor, Apr 26, 2018)
e81b1d8 policy/3scale_batcher: add ReportsBatch (davidor, Apr 26, 2018)
6692777 spec/policy/3scale_batcher: adds specs for ReportsBatch (davidor, Apr 26, 2018)
cfc0992 policy/3scale_batcher: add Reporter (davidor, Apr 26, 2018)
82df00b spec/policy/3scale_batcher: add specs for Reporter (davidor, Apr 26, 2018)
b3fe30c Add 3scale batcher policy (davidor, Apr 26, 2018)
e0edc01 policy/3scale_batcher: add manifest (davidor, Apr 26, 2018)
f388d17 backend_client: add report() (davidor, Apr 26, 2018)
d52edc7 spec/backend_client: add specs for report() (davidor, Apr 27, 2018)
21f4d91 policy/3scale_batcher: add README (davidor, Apr 30, 2018)
24e34c4 CHANGELOG: add entry for 3scale batcher policy (davidor, May 2, 2018)
ec6306b Add specs for the 3scale batcher policy (davidor, May 2, 2018)
b7789b3 t: add tests for the 3scale batcher policy (davidor, May 2, 2018)
2a1f754 policy/3scale_batcher/reports_batcher: avoid evicting reports from th… (davidor, May 9, 2018)
ecfa88e spec/policy/3scale_batcher/reports_batcher: test that reports are not… (davidor, May 9, 2018)
Files changed
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- OpenTracing support [PR #669](https://github.com/3scale/apicast/pull/669)
- Default value for the `caching_type` attribute of the caching policy config schema [#691](https://github.com/3scale/apicast/pull/691), [THREESCALE-845](https://issues.jboss.org/browse/THREESCALE-845)
- Generate new policy scaffold from the CLI [PR #682](https://github.com/3scale/apicast/pull/682)
- 3scale batcher policy [PR #685](https://github.com/3scale/apicast/pull/685)

### Fixed

7 changes: 7 additions & 0 deletions gateway/conf/nginx.conf.liquid
@@ -159,6 +159,13 @@ http {

lua_shared_dict limiter 1m;

# These shared dictionaries are only used by the 3scale batcher policy.
# This is not ideal, but they'll need to be here until we allow policies to
# modify this template.
Review comment (Contributor): Just for the record: #477

lua_shared_dict cached_auths 1m;
lua_shared_dict batched_reports 1m;
lua_shared_dict batched_reports_locks 1m;

{% for file in "sites.d/*.conf" | filesystem %}
{% include file %}
{% endfor %}
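These dictionaries are consumed through the standard `ngx.shared` API. A minimal sketch of the intended use (the key layout here is made up for illustration; the real format lives in the policy's KeysHelper):

```lua
-- Illustrative only: cache and read back an authorization status.
local cached_auths = ngx.shared.cached_auths

-- Cache HTTP status 200 for these credentials for 10 seconds.
local ok, err = cached_auths:set('42:user_key=abc:hits', 200, 10)
if not ok then
  ngx.log(ngx.ERR, 'failed to cache auth: ', err)
end

-- nil means the entry expired or was never cached.
local status = cached_auths:get('42:user_key=abc:hits')
```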
39 changes: 39 additions & 0 deletions gateway/src/apicast/backend_client.lua
@@ -14,6 +14,7 @@ local concat = table.concat
local insert = table.insert
local len = string.len
local format = string.format
local pairs = pairs

local http_ng = require('resty.http_ng')
local user_agent = require('apicast.user_agent')
@@ -120,6 +121,8 @@ local function auth_path(using_oauth)
'/transactions/authorize.xml'
end

local report_path = '/transactions.xml'

local function create_token_path(service_id)
return format('/services/%s/oauth_access_tokens.xml', service_id)
end
@@ -142,6 +145,32 @@ local function authorize_options(using_oauth)
return { headers = headers }
end

local function add_transaction(transactions, index, cred_type, cred, reports)
local index_with_cred = format('transactions[%s][%s]', index, cred_type)
transactions[index_with_cred] = cred

for metric, value in pairs(reports) do
local index_with_metric = format('transactions[%s][usage][%s]', index, metric)
transactions[index_with_metric] = value
end
end

local function format_transactions(reports_batch)
Review comment (@mikz, Contributor, May 9, 2018):
I'm not sure this method should live in the backend client.
It depends on ReportsBatch.

IMO this client should be unaware of any other objects and just work with plain tables (and possibly a __tostring metamethod on them).

edit: or some API exposed by this backend client returning tables that properly serialize.

local res = {}

-- Note: A service only supports one kind of credentials
local credentials_type = reports_batch.credentials_type
local reports = reports_batch.reports

local transaction_index = 0
for credential, metrics in pairs(reports) do
add_transaction(res, transaction_index, credentials_type, credential, metrics)
transaction_index = transaction_index + 1
end

return res
end

--- Call authrep (oauth_authrep) on backend.
-- @tparam ?{table,...} query list of query parameters
-- @treturn http_ng.response http response
@@ -168,6 +197,16 @@ function _M:authorize(...)
return call_backend_transaction(self, auth_uri, authorize_options(using_oauth), ...)
end

function _M:report(reports_batch)
local http_client = self.http_client

local report_uri = build_url(self, report_path)
local report_body = format_transactions(reports_batch)
local res = http_client.post(report_uri, report_body)

return res
end
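
-- Usage sketch (editor's illustration, not part of this diff; the batch
-- mirrors the ReportsBatch shape used by format_transactions above):
--
--   local batch = {
--     credentials_type = 'user_key',
--     reports = {
--       abc123 = { hits = 10 },
--       def456 = { hits = 3 },
--     },
--   }
--
--   backend:report(batch) POSTs a body equivalent to:
--     transactions[0][user_key]    = 'abc123'
--     transactions[0][usage][hits] = 10
--     transactions[1][user_key]    = 'def456'
--     transactions[1][usage][hits] = 3
--   (pairs() iteration order is unspecified, so the indexes may swap)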

--- Calls backend to create an oauth token.
-- @tparam ?{table, ...} list of query params (might include the token, ttl,
-- app_id, and user_id)
47 changes: 47 additions & 0 deletions gateway/src/apicast/errors.lua
@@ -0,0 +1,47 @@
local _M = { }

function _M.no_credentials(service)
ngx.log(ngx.INFO, 'no credentials provided for service ', service.id)
ngx.var.cached_key = nil
ngx.status = service.auth_missing_status
ngx.header.content_type = service.auth_missing_headers
ngx.print(service.error_auth_missing)
return ngx.exit(ngx.HTTP_OK)
end

function _M.authorization_failed(service)
ngx.log(ngx.INFO, 'authorization failed for service ', service.id)
ngx.var.cached_key = nil
ngx.status = service.auth_failed_status
ngx.header.content_type = service.auth_failed_headers
ngx.print(service.error_auth_failed)
return ngx.exit(ngx.HTTP_OK)
end

function _M.limits_exceeded(service)
ngx.log(ngx.INFO, 'limits exceeded for service ', service.id)
ngx.var.cached_key = nil
ngx.status = service.limits_exceeded_status
ngx.header.content_type = service.limits_exceeded_headers
ngx.print(service.error_limits_exceeded)
return ngx.exit(ngx.HTTP_OK)
end

function _M.no_match(service)
ngx.header.x_3scale_matched_rules = ''
ngx.log(ngx.INFO, 'no rules matched for service ', service.id)
ngx.var.cached_key = nil
ngx.status = service.no_match_status
ngx.header.content_type = service.no_match_headers
ngx.print(service.error_no_match)
return ngx.exit(ngx.HTTP_OK)
end

function _M.service_not_found(host)
ngx.status = 404
ngx.print('')
ngx.log(ngx.WARN, 'could not find service for host: ', host or ngx.var.host)
return ngx.exit(ngx.status)
end

return _M
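A short usage sketch (hypothetical caller; it mirrors how the 3scale batcher policy maps rejection reasons to these helpers):

```lua
local errors = require('apicast.errors')

-- Illustrative: turn a backend denial into the service's configured
-- error response.
local function deny(service, rejection_reason)
  if rejection_reason == 'limits_exceeded' then
    return errors.limits_exceeded(service)
  else
    return errors.authorization_failed(service)
  end
end
```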
150 changes: 150 additions & 0 deletions gateway/src/apicast/policy/3scale_batcher/3scale_batcher.lua
@@ -0,0 +1,150 @@
local backend_client = require('apicast.backend_client')
local AuthsCache = require('auths_cache')
local ReportsBatcher = require('reports_batcher')
local policy = require('apicast.policy')
local errors = require('apicast.errors')
local reporter = require('reporter')
local http_ng_resty = require('resty.http_ng.backend.resty')
local semaphore = require('ngx.semaphore')

local ipairs = ipairs

local default_auths_ttl = 10
local default_batch_reports_seconds = 10

local _M = policy.new('3scale batcher')

local new = _M.new

function _M.new(config)
local self = new(config)

local auths_ttl = config.auths_ttl or default_auths_ttl
self.auths_cache = AuthsCache.new(ngx.shared.cached_auths, auths_ttl)

self.reports_batcher = ReportsBatcher.new(
ngx.shared.batched_reports, 'batched_reports_locks')

self.batch_reports_seconds = config.batch_report_seconds or
default_batch_reports_seconds

self.report_timer_on = false

-- Semaphore used to ensure that only one timer is started per worker.
local semaphore_report_timer, err = semaphore.new(1)
if not semaphore_report_timer then
ngx.log(ngx.ERR, "Create semaphore failed: ", err)
end
self.semaphore_report_timer = semaphore_report_timer

return self
end

-- TODO: More policies are using this method. Move it to backend_client to
-- avoid duplicating code.
-- Converts a usage to the format expected by the 3scale backend client.
local function format_usage(usage)
local res = {}

local usage_metrics = usage.metrics
local usage_deltas = usage.deltas

for _, metric in ipairs(usage_metrics) do
local delta = usage_deltas[metric]
res['usage[' .. metric .. ']'] = delta
Review comment (Contributor):
No need to change this now, but this should also use string.format to avoid the allocations.

end

return res
end
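
-- Editor's illustration (not part of this diff): given
--   local usage = {
--     metrics = { 'hits', 'bytes' },
--     deltas  = { hits = 2, bytes = 1024 },
--   }
-- format_usage(usage) returns:
--   { ['usage[hits]'] = 2, ['usage[bytes]'] = 1024 }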

local function set_flags_to_avoid_auths_in_apicast(context)
context.skip_apicast_access = true
context.skip_apicast_post_action = true
end

local function report(_, service_id, backend, reports_batcher)
local reports = reports_batcher:get_all(service_id)

-- TODO: verify if we should limit the number of reports sent in a single req
reporter.report(reports, service_id, backend, reports_batcher)
end

-- This starts a timer on each worker.
-- Starting a timer on each worker means that there will be more calls to
-- 3scale backend, and the config param 'batch_report_seconds' becomes
-- more confusing because the reporting frequency will be affected by the
-- number of APIcast workers.
-- If we started a timer just on one of the workers, it could die, and then
-- there would not be any reporting.
local function ensure_report_timer_on(self, service_id, backend)
local check_timer = self.semaphore_report_timer:wait(0)

if check_timer then
if not self.report_timer_on then
ngx.timer.every(self.batch_reports_seconds, report,
service_id, backend, self.reports_batcher)

self.report_timer_on = true
end

self.semaphore_report_timer:post()
end
end

local function rejection_reason_from_headers(response_headers)
return response_headers and response_headers['3scale-rejection-reason']
end

local function error(service, rejection_reason)
if rejection_reason == 'limits_exceeded' then
return errors.limits_exceeded(service)
else
return errors.authorization_failed(service)
end
end

-- Note: when an entry in the cache expires, there might be several requests
-- with those credentials and all of them will call auth() on backend with the
-- same parameters until the auth status is cached again. In the future, we
-- might want to introduce a mechanism to avoid this and reduce the number of
-- calls to backend.
function _M:access(context)
local backend = backend_client:new(context.service, http_ng_resty)
local usage = context.usage
local service = context.service
local service_id = service.id
local credentials = context.credentials

ensure_report_timer_on(self, service_id, backend)
Review comment (Contributor):
We definitely have to expose some way to init/destroy policies so they can do stuff like this. Global policies can do this in init_worker, and we need some mechanism for local policies too, including destructors, because they can get GC'd.


local cached_auth = self.auths_cache:get(service_id, credentials, usage)

if not cached_auth then
local formatted_usage = format_usage(usage)
local backend_res = backend:authorize(formatted_usage, credentials)
local backend_status = backend_res.status

if backend_status == 200 then
self.auths_cache:set(service_id, credentials, usage, 200)
self.reports_batcher:add(service_id, credentials, usage)
elseif backend_status >= 400 and backend_status < 500 then
local rejection_reason = rejection_reason_from_headers(backend_res.headers)
self.auths_cache:set(service_id, credentials, usage, backend_status, rejection_reason)
Review comment (@mikz, Contributor, May 9, 2018):
If service_id, credentials and usage are always passed together to the auths_cache, then they might deserve their own object. And that could be reused for the reports batcher?

This looks like clear parameter coupling and makes the parameter list really long.

Reply (davidor, Contributor, Author):
Yes, I thought about that too. I think there are some refactoring opportunities there. Maybe we could extract some classes like Report or Authorization. I'd rather leave this for a future PR though.
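
One possible shape for such an object (a sketch of the refactor discussed here, not code from this PR):

```lua
-- Hypothetical value object grouping the three coupled parameters.
local Transaction = {}
Transaction.__index = Transaction

function Transaction.new(service_id, credentials, usage)
  return setmetatable(
    { service_id = service_id, credentials = credentials, usage = usage },
    Transaction)
end

-- Callers would then pass a single argument:
--   self.auths_cache:set(transaction, backend_status, rejection_reason)
--   self.reports_batcher:add(transaction)
```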

return error(service, rejection_reason)
else
return error(service)
end
else
if cached_auth.status == 200 then
self.reports_batcher:add(service_id, credentials, usage)
else
return error(service, cached_auth.rejection_reason)
end
end

set_flags_to_avoid_auths_in_apicast(context)
end

return _M
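The `ReportsBatcher` module itself is not shown in this diff. As a rough sketch of the aggregation it performs, assuming the `batched_reports` shared dict and `batched_reports_locks` lock dict declared in nginx.conf.liquid above, plus `resty.lock` for mutual exclusion:

```lua
local resty_lock = require('resty.lock')

-- Sketch only: add a delta to one (service, credential, metric) counter.
-- The lock prevents concurrent accesses from losing updates on the same key.
local function add_report(shdict, key, delta)
  local lock, err = resty_lock:new('batched_reports_locks')
  if not lock then return nil, err end

  local locked, lock_err = lock:lock(key)
  if not locked then return nil, lock_err end

  -- incr with an init value of 0 creates the counter if it's missing.
  local new_value, incr_err = shdict:incr(key, delta, 0)

  lock:unlock()
  return new_value, incr_err
end

-- e.g. add_report(ngx.shared.batched_reports, '42:abc:hits', 1)
```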
65 changes: 65 additions & 0 deletions gateway/src/apicast/policy/3scale_batcher/README.md
@@ -0,0 +1,65 @@
# 3scale Batcher Policy

## Description

The standard APIcast policy performs one call to the 3scale backend for each
request that it receives. The goal of this policy is to reduce latency and
increase throughput by significantly reducing the number of requests made to the
3scale backend. In order to achieve that, this policy caches authorization
statuses and batches reports.

## Technical details

When the standard APIcast policy receives a request, it makes an 'authrep' call
to backend. This call checks the credentials sent by APIcast and applies rate
limiting over the reported metrics. If the credentials are correct and the rate
limits are not violated, backend also increases the counters of the metrics
reported by APIcast. These counters are used both to show statistics in the
3scale UI and to apply the rate limits. This means that the rate limiting
applied will not be accurate until the counter is updated. For limits defined
over long windows of time (hour, day, etc.) this update lag is often irrelevant.
However, it might be important to take into account for limits defined over a
small window of time (a per-minute limit, for example).

This policy uses a cache for authorizations and batches reports. Also, it makes
'authorize' and 'report' calls to backend instead of 'authrep' calls. On each
request, the flow is as follows:

1. The policy checks whether the credentials are cached. If they are, the policy
uses the cached authorization status instead of calling 3scale's backend. When
the credentials are not cached, it calls backend and caches the authorization
status with a configurable TTL.

2. Instead of reporting to 3scale's backend the metrics associated with the
request, the policy accumulates their usages to report to backend in batches.

Apart from that, there's a separate timer that reports to backend periodically.
The reporting interval is configurable. This timer fetches all the batched
reports and sends them to backend in a single call.
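
In simplified pseudocode, the per-request flow looks like this (illustrative;
see `3scale_batcher.lua` in this PR for the real logic):

```lua
-- Simplified: error handling and the skip-APIcast flags are omitted.
local auth = auths_cache:get(service_id, credentials, usage)

if not auth then
  local res = backend:authorize(formatted_usage, credentials)
  auths_cache:set(service_id, credentials, usage, res.status)
  auth = { status = res.status }
end

if auth.status == 200 then
  reports_batcher:add(service_id, credentials, usage) -- reported later in batch
else
  return deny(service) -- authorization failed or limits exceeded
end
```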

This approach increases throughput for two reasons. First, it caches
authorizations, which reduces the number of calls to the 3scale backend. Second,
it batches the reports. This also helps reduce the number of calls made to the
3scale backend, but more importantly, it reduces the amount of work the backend
needs to do, because the policy has already aggregated the metrics to report.
Suppose that we define a mapping rule that increases the metric 'hits' by one on
each request, and that we receive 1000 requests per second. If we define a
batching period of 10 seconds, this policy will report to the 3scale backend a
single 'hits +10000' instead of 10000 separate 'hits +1' reports. This matters
because, from the 3scale backend perspective, reporting a +10000 or a +1 to its
database is the same amount of work.

Of course, reporting to 3scale in batches has a trade-off: rate limiting loses
accuracy. The reason is that, while reports are accumulated, they're not being
sent to backend, and rate limits only take into account reports that have
already been stored in the 3scale backend database. In summary, going over the
defined usage limits is easier. The standard APIcast policy reports to 3scale
backend every time it receives a request. Reports are asynchronous, which means
we can go over the limits for a brief window of time. On the other hand, this
policy reports
every X seconds (configurable) to 3scale backend. The window of time in which we
can go over the limits is wider in this case.

Review comment (Contributor):
I'd like to explore a different strategy for reporting.

Right now there is a timer to report every N seconds.

I could imagine a strategy where it would report continuously.
Let's say there is a request and it triggers a backend call. Then another
request comes and, because the backend call is still active, it would be added
to shmem. Then, when the backend call finishes, it can collect the cached
reports and issue a new call.

So basically it would just cache the parallel calls and make them 1:1 to backend.

All these optimizations have compromises, and to choose the right one we should
have some target. There are plenty of ways to do this, with different tradeoffs
and performance characteristics. We should define some bar we want to reach and
choose the correct way to get there.

Fully caching everything and then reporting to backend in one batch definitely
has the best performance, but also the highest chance of going wrong.
This continuous reporting would not give a 10x gain, maybe just 2x or 3x, but it
would be safer and more accurate.
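
A rough sketch of that continuous strategy (editor's illustration only, using
per-worker state and the `reporter` and `ReportsBatcher` APIs from the diff
above):

```lua
local reporter = require('reporter') -- as in the policy code above

-- While a backend call is in flight, requests only accumulate reports in
-- the batcher; the first request after the call returns flushes the batch.
local reporting_in_flight = false

local function report_continuously(batcher, service_id, backend)
  if reporting_in_flight then return end
  reporting_in_flight = true

  local reports = batcher:get_all(service_id)
  if #reports > 0 then
    -- Network I/O yields here; concurrent requests keep batching.
    reporter.report(reports, service_id, backend, batcher)
  end

  reporting_in_flight = false
end
```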

Review comment (@eguzki, Member, May 2, 2018):
Backend report scheduling strategy is a hard problem. Definitely, trial and
error with different approaches is the best strategy to come up with the best
solution within our constraints.

I think it may depend on the cache hit ratio. When the hit ratio is high, the
best approach might be waiting as long as we can to achieve the highest batch
size in the allowed time window. On the other hand, when the hit ratio is low,
waiting does not make sense; keeping memory, file descriptors and timers
(through fd's) is expensive.

We could implement a dynamic algorithm that changes strategies depending on the
cache hit ratio, which we could measure.

Review comment (Member):
On event-based systems, locks are very painful, with unaffordable performance
penalties.

I suggest we do not use shared resources and IPC mechanisms. Each worker keeps
its own state. Performance and low latency are the advantages. The drawback is
increased backend traffic, since the batching level is lower. Again, it depends
on the cache hit ratio: good when the hit ratio is high, it does not make sense
in high cache-miss scenarios.


The effectiveness of this policy will depend on the cache hit ratio. For use
cases where the variety of services, apps, metrics, etc. is relatively low,
caching and batching will be very effective and will increase the throughput of
the system significantly.
24 changes: 24 additions & 0 deletions gateway/src/apicast/policy/3scale_batcher/apicast-policy.json
@@ -0,0 +1,24 @@
{
"$schema": "http://apicast.io/policy-v1/schema#manifest#",
"name": "3scale batcher",
"summary": "Caches auths from 3scale backend and batches reports.",
"description":
["This policy caches authorizations from the 3scale backend ",
"and also reports in batches. Doing this is more efficient than ",
"authorizing and reporting on each request at the expense of losing ",
"accuracy in the rate limits."],
"version": "builtin",
"configuration": {
"type": "object",
"properties": {
"auths_ttl": {
"description": "TTL for cached auths in seconds",
"type": "integer"
},
"batch_report_seconds": {
"description": "Duration (in seconds) for batching reports",
"type": "integer"
}
}
}
}