From 425f546ed76e8324e779c1e81e04e25e019e2d02 Mon Sep 17 00:00:00 2001
From: Nick Muerdter
Date: Wed, 23 Mar 2016 11:57:50 -0600
Subject: [PATCH] Change structure of how we store the various analytics time
 fields.

Instead of storing the individual components, we're now storing a full date
or timestamp string giving the starting date of that block. So, for example,
for 2016-03-23, request_at_tz_year would be 2016-01-01, request_at_tz_month
would be 2016-03-01, etc.

While the previous storage of the individual components was more efficient
(from a space perspective) and less duplicative, this switch was made for a
couple of reasons:

- In order to query the previous individual fields, we had to concatenate
  them at query time. Skipping concatenation gives us a decent performance
  boost when queries have to fall back to Presto (an example query
  consistently took 60 seconds with concatenation vs 45 seconds without).
- The bigger stumbling block was that Presto doesn't support CASTing to
  CHAR, which meant various queries couldn't fall back to Presto at all. We
  could cast to VARCHAR in Presto, but then sorting the dates wouldn't work
  properly, since the integers weren't zero-padded (so 12 would sort before
  2).
- Despite the less efficient storage (although it shouldn't make much of a
  difference in the columnar store), this approach greatly simplifies
  querying, since you only ever have to GROUP BY a single time field.
---
 .../elasticsearch-import/README.md                 |  4 +--
 .../hadoop_analytics/DayWorker.java                | 28 +++++++++++--------
 .../schema/src/main/resources/log.avsc             | 10 +++---
 .../proxy/hooks/log_initial_proxy.lua              | 25 ++++++++++++-----
 .../web-app/app/models/log_search/sql.rb           | 12 ++++----
 5 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md b/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md
index b77c6fd81..c2ac88558 100644
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md
+++ b/src/api-umbrella/hadoop-analytics/elasticsearch-import/README.md
@@ -5,9 +5,9 @@ $ sudo -u hive java -Dapiumbrella.page_size=10000 -Dapiumbrella.elasticsearch_ur
 $ sudo -u hive hive
 hive> CREATE DATABASE api_umbrella;
-hive> CREATE EXTERNAL TABLE api_umbrella.logs(request_at BIGINT, id STRING, request_at_tz_offset INT, request_at_tz_hour SMALLINT, request_at_tz_minute SMALLINT, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (request_at_tz_year SMALLINT, request_at_tz_month TINYINT, request_at_tz_week TINYINT, request_at_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs';
+hive> CREATE EXTERNAL TABLE api_umbrella.logs(request_at BIGINT, id STRING, request_at_tz_offset INT, request_at_tz_hour STRING, request_at_tz_minute STRING, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (request_at_tz_year DATE, request_at_tz_month DATE, request_at_tz_week DATE, request_at_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs';
 hive> exit;
 
-$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep "\.orc$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*request_at_tz_year=\([0-9]\+\).*request_at_tz_month=\([0-9]\+\).*request_at_tz_week=\([0-9]\+\).*request_at_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(request_at_tz_year=\1, request_at_tz_month=\2, request_at_tz_week=\3, request_at_tz_date='\4');/" > /tmp/api_umbrella_load_partitions.sql
+$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep "\.orc$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*request_at_tz_year=\([0-9\-]\+\).*request_at_tz_month=\([0-9\-]\+\).*request_at_tz_week=\([0-9\-]\+\).*request_at_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(request_at_tz_year='\1', request_at_tz_month='\2', request_at_tz_week='\3', request_at_tz_date='\4');/" > /tmp/api_umbrella_load_partitions.sql
 $ sudo -u hive hive -f /tmp/api_umbrella_load_partitions.sql && rm /tmp/api_umbrella_load_partitions.sql
 
 $ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/projects' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{"name":"api_umbrella","description":""}'
diff --git a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java
index 7786faaf5..288917492 100644
--- a/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java
+++ b/src/api-umbrella/hadoop-analytics/elasticsearch-import/src/main/java/apiumbrella/hadoop_analytics/DayWorker.java
@@ -36,6 +36,7 @@
 import org.joda.time.DateTime;
 import org.joda.time.DateTimeZone;
 import org.joda.time.Period;
+import org.joda.time.format.DateTimeFormat;
 import org.joda.time.format.DateTimeFormatter;
 import org.joda.time.format.ISODateTimeFormat;
 import org.slf4j.Logger;
@@ -66,7 +67,8 @@ public class DayWorker implements Runnable {
   private static WriterOptions orcWriterOptions;
   private Writer orcWriter;
   DateTimeFormatter dateTimeParser = ISODateTimeFormat.dateTimeParser();
-  DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.dateTime().withZone(App.TIMEZONE);
+  DateTimeFormatter dateTimeFormatter =
+      DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE);
   DateTimeFormatter dateFormatter = ISODateTimeFormat.date().withZone(App.TIMEZONE);
 
   public DayWorker(App app, DateTime date) {
@@ -193,11 +195,11 @@ private Writer getOrcWriter() throws IOException {
     if (this.orcWriter == null) {
       String date = dateFormatter.print(dayStartTime);
       // Create a new file in /dir/YYYY/MM/WW/YYYY-MM-DD.par
-      Path path = new Path(
-          App.HDFS_URI + Paths.get(App.DIR, "request_at_tz_year=" + dayStartTime.toString("YYYY"),
-              "request_at_tz_month=" + dayStartTime.getMonthOfYear(),
-              "request_at_tz_week=" + dayStartTime.getWeekOfWeekyear(),
-              "request_at_tz_date=" + date, date + ".orc"));
+      Path path = new Path(App.HDFS_URI + Paths.get(App.DIR,
+          "request_at_tz_year=" + dateFormatter.print(dayStartTime.withDayOfYear(1)),
+          "request_at_tz_month=" + dateFormatter.print(dayStartTime.withDayOfMonth(1)),
+          "request_at_tz_week=" + dateFormatter.print(dayStartTime.withDayOfWeek(1)),
+          "request_at_tz_date=" + date, date + ".orc"));
 
       this.orcWriter = OrcFile.createWriter(path, getOrcWriterOptions());
     }
@@ -266,12 +268,14 @@ private void processHit(JsonObject hit) throws Exception {
           DateTime requestAt = this.parseTimestamp(value);
           log.put("request_at", requestAt.getMillis());
           log.put("request_at_tz_offset", App.TIMEZONE.getOffset(requestAt.getMillis()));
-          log.put("request_at_tz_year", requestAt.getYear());
-          log.put("request_at_tz_month", requestAt.getMonthOfYear());
-          log.put("request_at_tz_week", requestAt.getWeekOfWeekyear());
-          log.put("request_at_tz_date", this.dateFormatter.print(requestAt));
-          log.put("request_at_tz_hour", requestAt.getHourOfDay());
-          log.put("request_at_tz_minute", requestAt.getMinuteOfHour());
+          log.put("request_at_tz_year", dateFormatter.print(requestAt.withDayOfYear(1)));
+          log.put("request_at_tz_month", dateFormatter.print(requestAt.withDayOfMonth(1)));
+          log.put("request_at_tz_week", dateFormatter.print(requestAt.withDayOfWeek(1)));
+          log.put("request_at_tz_date", dateFormatter.print(requestAt));
+          log.put("request_at_tz_hour", dateTimeFormatter
+              .print(requestAt.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)));
+          log.put("request_at_tz_minute",
+              dateTimeFormatter.print(requestAt.withSecondOfMinute(0).withMillisOfSecond(0)));
           value = null;
           break;
         case "request_ip_location":
diff --git a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc b/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc
index 8856f6ef2..4dc698848 100644
--- a/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc
+++ b/src/api-umbrella/hadoop-analytics/schema/src/main/resources/log.avsc
@@ -16,15 +16,15 @@
     },
     {
       "name": "request_at_tz_year",
-      "type": "int"
+      "type": "string"
     },
     {
      "name": "request_at_tz_month",
-      "type": "int"
+      "type": "string"
     },
     {
       "name": "request_at_tz_week",
-      "type": "int"
+      "type": "string"
     },
     {
       "name": "request_at_tz_date",
@@ -32,11 +32,11 @@
     },
     {
       "name": "request_at_tz_hour",
-      "type": "int"
+      "type": "string"
     },
     {
       "name": "request_at_tz_minute",
-      "type": "int"
+      "type": "string"
     },
     {
       "name": "user_id",
diff --git a/src/api-umbrella/proxy/hooks/log_initial_proxy.lua b/src/api-umbrella/proxy/hooks/log_initial_proxy.lua
index 94a28142c..bf2c3bf48 100644
--- a/src/api-umbrella/proxy/hooks/log_initial_proxy.lua
+++ b/src/api-umbrella/proxy/hooks/log_initial_proxy.lua
@@ -155,14 +155,25 @@ local function log_request()
   local utc_sec = data["request_at"]
   local tz_offset = timezone:find_current(utc_sec).gmtoff
   local tz_sec = utc_sec + tz_offset
-  local tz_time = os.date("!*t", tz_sec)
+  local tz_time = os.date("!%Y-%m-%d %H:%M:00", tz_sec)
+
+  -- Determine the first day in the ISO week (the most recent Monday).
+  local tz_week = luatz.gmtime(tz_sec)
+  if tz_week.wday == 1 then
+    tz_week.day = tz_week.day - 6
+    tz_week:normalize()
+  elseif tz_week.wday > 2 then
+    tz_week.day = tz_week.day - tz_week.wday + 2
+    tz_week:normalize()
+  end
+
   data["request_at_tz_offset"] = tz_offset * 1000
-  data["request_at_tz_year"] = tz_time["year"]
-  data["request_at_tz_month"] = tz_time["month"]
-  data["request_at_tz_week"] = tonumber(os.date("!%V", tz_sec))
-  data["request_at_tz_date"] = os.date("!%Y-%m-%d", tz_sec)
-  data["request_at_tz_hour"] = tz_time["hour"]
-  data["request_at_tz_minute"] = tz_time["min"]
+  data["request_at_tz_year"] = string.sub(tz_time, 1, 4) .. "-01-01" -- YYYY-01-01
+  data["request_at_tz_month"] = string.sub(tz_time, 1, 7) .. "-01" -- YYYY-MM-01
+  data["request_at_tz_week"] = tz_week:strftime("%Y-%m-%d") -- YYYY-MM-DD of first day in ISO week.
+  data["request_at_tz_date"] = string.sub(tz_time, 1, 10) -- YYYY-MM-DD
+  data["request_at_tz_hour"] = string.sub(tz_time, 1, 13) .. ":00:00" -- YYYY-MM-DD HH:00:00
+  data["request_at_tz_minute"] = tz_time -- YYYY-MM-DD HH:MM:00
 
   -- Check for log data set by the separate api backend proxy
   -- (log_api_backend_proxy.lua). This is used for timing information.
diff --git a/src/api-umbrella/web-app/app/models/log_search/sql.rb b/src/api-umbrella/web-app/app/models/log_search/sql.rb
index 705c8f8ea..2797f8a95 100644
--- a/src/api-umbrella/web-app/app/models/log_search/sql.rb
+++ b/src/api-umbrella/web-app/app/models/log_search/sql.rb
@@ -30,17 +30,17 @@ def initialize(options = {})
     when "minute"
       raise "TODO"
     when "hour"
-      @interval_field = "CAST(request_at_tz_date AS CHAR(10)) || '-' || CAST(request_at_tz_hour AS CHAR(2))"
-      @interval_field_format = "%Y-%m-%d-%H"
+      @interval_field = "request_at_tz_hour"
+      @interval_field_format = "%Y-%m-%d %H:%M:%S"
     when "day"
       @interval_field = "request_at_tz_date"
       @interval_field_format = "%Y-%m-%d"
     when "week"
-      @interval_field = "CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_week AS CHAR(2))"
-      @interval_field_format = "%G-%V"
+      @interval_field = "request_at_tz_week"
+      @interval_field_format = "%Y-%m-%d"
     when "month"
-      @interval_field = "CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_month AS CHAR(2))"
-      @interval_field_format = "%Y-%m"
+      @interval_field = "request_at_tz_month"
+      @interval_field_format = "%Y-%m-%d"
     end
 
     @query = {
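
For illustration only (not part of the patch): because each time field now stores the starting date of its period, a rollup only ever groups on a single column, where the old schema required concatenating the year/month/week integers at query time (the pattern removed from sql.rb above). Below is a minimal sketch of such a query against the api_umbrella.logs Hive table defined in the README; the date bounds and the "hits" alias are assumptions for the example.

    -- Old layout would have needed something like:
    --   GROUP BY CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_month AS CHAR(2))
    -- which Presto cannot run (no CAST to CHAR) and which sorts badly as VARCHAR.

    -- New layout: request_at_tz_month already holds the first day of the month
    -- (e.g. 2016-03-01), so a monthly rollup groups on one column and sorts
    -- correctly as a plain string.
    SELECT request_at_tz_month, COUNT(*) AS hits
    FROM api_umbrella.logs
    WHERE request_at_tz_date BETWEEN '2016-01-01' AND '2016-03-31'
    GROUP BY request_at_tz_month
    ORDER BY request_at_tz_month;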