Change structure of how we store the various analytics time fields.
Instead of storing the individual components, we're now storing a full
date or timestamp string, giving the starting date of that block. So,
for example, for 2016-03-23, the request_at_tz_year would be 2016-01-01,
the request_at_tz_month would be 2016-03-01, etc.

While the previous storage of the individual components was more
space-efficient and less duplicative, we made this switch for a few
reasons:

- In order to query the previous individual fields, we had to
  concatenate them at query time. Skipping that concatenation gives us a
  decent performance boost when queries have to fall back to Presto (an
  example query consistently took 60 seconds with concatenation vs 45
  seconds without).
- The bigger stumbling block we ran into was that Presto doesn't support
  CASTing to CHARs, which meant various queries couldn't fall back to
  Presto at all. We could cast to VARCHAR in Presto, but then sorting
  the dates wouldn't work properly, since the integers weren't
  zero-padded (so 12 would sort before 2).
- Despite the less efficient storage (which shouldn't actually make
  much of a difference in the columnar store), this approach greatly
  simplifies querying, since you only ever have to GROUP BY a single
  time field (see the sketch below).
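To make that concrete, here is an illustrative before/after for a
monthly-hits query (a sketch, not part of this commit; the old
expression is the one the web app used to generate):

  -- Old schema: concatenate integer components at query time
  -- (and "2016-12" sorts before "2016-2", since months aren't zero-padded).
  SELECT CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_month AS CHAR(2)) AS month, COUNT(*) AS hits
  FROM api_umbrella.logs
  GROUP BY CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_month AS CHAR(2));

  -- New schema: one ready-made, zero-padded date string per granularity.
  SELECT request_at_tz_month AS month, COUNT(*) AS hits
  FROM api_umbrella.logs
  GROUP BY request_at_tz_month
  ORDER BY request_at_tz_month;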
GUI committed Mar 23, 2016
1 parent 77ed897 commit 425f546
Showing 5 changed files with 47 additions and 32 deletions.
@@ -5,9 +5,9 @@ $ sudo -u hive java -Dapiumbrella.page_size=10000 -Dapiumbrella.elasticsearch_ur
$ sudo -u hive hive
hive> CREATE DATABASE api_umbrella;
-hive> CREATE EXTERNAL TABLE api_umbrella.logs(request_at BIGINT, id STRING, request_at_tz_offset INT, request_at_tz_hour SMALLINT, request_at_tz_minute SMALLINT, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (request_at_tz_year SMALLINT, request_at_tz_month TINYINT, request_at_tz_week TINYINT, request_at_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs';
+hive> CREATE EXTERNAL TABLE api_umbrella.logs(request_at BIGINT, id STRING, request_at_tz_offset INT, request_at_tz_hour STRING, request_at_tz_minute STRING, user_id STRING, denied_reason STRING, request_method STRING, request_url_scheme STRING, request_url_host STRING, request_url_port INT, request_url_path STRING, request_url_path_level1 STRING, request_url_path_level2 STRING, request_url_path_level3 STRING, request_url_path_level4 STRING, request_url_path_level5 STRING, request_url_path_level6 STRING, request_url_query STRING, request_ip STRING, request_ip_country STRING, request_ip_region STRING, request_ip_city STRING, request_ip_lat DOUBLE, request_ip_lon DOUBLE, request_user_agent STRING, request_user_agent_type STRING, request_user_agent_family STRING, request_size INT, request_accept STRING, request_accept_encoding STRING, request_content_type STRING, request_connection STRING, request_origin STRING, request_referer STRING, request_basic_auth_username STRING, response_status SMALLINT, response_content_type STRING, response_content_length INT, response_content_encoding STRING, response_transfer_encoding STRING, response_server STRING, response_cache STRING, response_age INT, response_size INT, timer_response DOUBLE, timer_backend_response DOUBLE, timer_internal DOUBLE, timer_proxy_overhead DOUBLE, log_imported BOOLEAN) PARTITIONED BY (request_at_tz_year DATE, request_at_tz_month DATE, request_at_tz_week DATE, request_at_tz_date DATE) STORED AS ORC LOCATION '/apps/api-umbrella/logs';
hive> exit;
-$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep "\.orc$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*request_at_tz_year=\([0-9]\+\).*request_at_tz_month=\([0-9]\+\).*request_at_tz_week=\([0-9]\+\).*request_at_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(request_at_tz_year=\1, request_at_tz_month=\2, request_at_tz_week=\3, request_at_tz_date='\4');/" > /tmp/api_umbrella_load_partitions.sql
+$ sudo -u hive hadoop fs -ls -R /apps/api-umbrella/logs | grep "\.orc$" | grep -o "[^ ]*$" | sort -V | sed -e "s/.*request_at_tz_year=\([0-9\-]\+\).*request_at_tz_month=\([0-9\-]\+\).*request_at_tz_week=\([0-9\-]\+\).*request_at_tz_date=\([0-9\-]\+\).*/ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(request_at_tz_year='\1', request_at_tz_month='\2', request_at_tz_week='\3', request_at_tz_date='\4');/" > /tmp/api_umbrella_load_partitions.sql
$ sudo -u hive hive -f /tmp/api_umbrella_load_partitions.sql && rm /tmp/api_umbrella_load_partitions.sql
$ curl 'http://ADMIN:KYLIN@localhost:7070/kylin/api/projects' -H 'Content-Type: application/json;charset=UTF-8' --data-binary '{"name":"api_umbrella","description":""}'
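For reference, each line the sed command above writes into
/tmp/api_umbrella_load_partitions.sql is an ALTER TABLE statement built
from the partition directory names; for the 2016-03-23 example it would
look like:

  ALTER TABLE api_umbrella.logs ADD IF NOT EXISTS PARTITION(request_at_tz_year='2016-01-01', request_at_tz_month='2016-03-01', request_at_tz_week='2016-03-21', request_at_tz_date='2016-03-23');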
@@ -36,6 +36,7 @@
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
+import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
@@ -66,7 +67,8 @@ public class DayWorker implements Runnable {
private static WriterOptions orcWriterOptions;
private Writer orcWriter;
DateTimeFormatter dateTimeParser = ISODateTimeFormat.dateTimeParser();
-DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.dateTime().withZone(App.TIMEZONE);
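+// Produces Hive-style "yyyy-MM-dd HH:mm:ss" strings (e.g. "2016-03-23 04:15:00")
+// instead of the previous ISO-8601 output.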
+DateTimeFormatter dateTimeFormatter =
+    DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(App.TIMEZONE);
DateTimeFormatter dateFormatter = ISODateTimeFormat.date().withZone(App.TIMEZONE);

public DayWorker(App app, DateTime date) {
@@ -193,11 +195,11 @@ private Writer getOrcWriter() throws IOException {
if (this.orcWriter == null) {
String date = dateFormatter.print(dayStartTime);
// Create a new file in /dir/request_at_tz_year=YYYY-01-01/request_at_tz_month=YYYY-MM-01/request_at_tz_week=YYYY-MM-DD/request_at_tz_date=YYYY-MM-DD/YYYY-MM-DD.orc
-Path path = new Path(
-    App.HDFS_URI + Paths.get(App.DIR, "request_at_tz_year=" + dayStartTime.toString("YYYY"),
-        "request_at_tz_month=" + dayStartTime.getMonthOfYear(),
-        "request_at_tz_week=" + dayStartTime.getWeekOfWeekyear(),
-        "request_at_tz_date=" + date, date + ".orc"));
+Path path = new Path(App.HDFS_URI + Paths.get(App.DIR,
+    "request_at_tz_year=" + dateFormatter.print(dayStartTime.withDayOfYear(1)),
+    "request_at_tz_month=" + dateFormatter.print(dayStartTime.withDayOfMonth(1)),
+    "request_at_tz_week=" + dateFormatter.print(dayStartTime.withDayOfWeek(1)),
+    "request_at_tz_date=" + date, date + ".orc"));
this.orcWriter = OrcFile.createWriter(path, getOrcWriterOptions());
}

@@ -266,12 +268,14 @@ private void processHit(JsonObject hit) throws Exception {
DateTime requestAt = this.parseTimestamp(value);
log.put("request_at", requestAt.getMillis());
log.put("request_at_tz_offset", App.TIMEZONE.getOffset(requestAt.getMillis()));
log.put("request_at_tz_year", requestAt.getYear());
log.put("request_at_tz_month", requestAt.getMonthOfYear());
log.put("request_at_tz_week", requestAt.getWeekOfWeekyear());
log.put("request_at_tz_date", this.dateFormatter.print(requestAt));
log.put("request_at_tz_hour", requestAt.getHourOfDay());
log.put("request_at_tz_minute", requestAt.getMinuteOfHour());
log.put("request_at_tz_year", dateFormatter.print(requestAt.withDayOfYear(1)));
log.put("request_at_tz_month", dateFormatter.print(requestAt.withDayOfMonth(1)));
log.put("request_at_tz_week", dateFormatter.print(requestAt.withDayOfWeek(1)));
log.put("request_at_tz_date", dateFormatter.print(requestAt));
log.put("request_at_tz_hour", dateTimeFormatter
.print(requestAt.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)));
log.put("request_at_tz_minute",
dateTimeFormatter.print(requestAt.withSecondOfMinute(0).withMillisOfSecond(0)));
value = null;
break;
case "request_ip_location":
@@ -16,27 +16,27 @@
},
{
"name": "request_at_tz_year",
"type": "int"
"type": "string"
},
{
"name": "request_at_tz_month",
"type": "int"
"type": "string"
},
{
"name": "request_at_tz_week",
"type": "int"
"type": "string"
},
{
"name": "request_at_tz_date",
"type": "string"
},
{
"name": "request_at_tz_hour",
"type": "int"
"type": "string"
},
{
"name": "request_at_tz_minute",
"type": "int"
"type": "string"
},
{
"name": "user_id",
src/api-umbrella/proxy/hooks/log_initial_proxy.lua (25 changes: 18 additions & 7 deletions)
@@ -155,14 +155,25 @@ local function log_request()
local utc_sec = data["request_at"]
local tz_offset = timezone:find_current(utc_sec).gmtoff
local tz_sec = utc_sec + tz_offset
local tz_time = os.date("!*t", tz_sec)
local tz_time = os.date("!%Y-%m-%d %H:%M:00", tz_sec)

+-- Determine the first day in the ISO week (the most recent Monday).
+local tz_week = luatz.gmtime(tz_sec)
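+-- (luatz timetables follow Lua's os.date("*t") convention: wday 1 = Sunday, 2 = Monday.)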
+if tz_week.wday == 1 then
+  tz_week.day = tz_week.day - 6
+  tz_week:normalize()
+elseif tz_week.wday > 2 then
+  tz_week.day = tz_week.day - tz_week.wday + 2
+  tz_week:normalize()
+end

data["request_at_tz_offset"] = tz_offset * 1000
data["request_at_tz_year"] = tz_time["year"]
data["request_at_tz_month"] = tz_time["month"]
data["request_at_tz_week"] = tonumber(os.date("!%V", tz_sec))
data["request_at_tz_date"] = os.date("!%Y-%m-%d", tz_sec)
data["request_at_tz_hour"] = tz_time["hour"]
data["request_at_tz_minute"] = tz_time["min"]
data["request_at_tz_year"] = string.sub(tz_time, 1, 4) .. "-01-01" -- YYYY-01-01
data["request_at_tz_month"] = string.sub(tz_time, 1, 7) .. "-01" -- YYYY-MM-01
data["request_at_tz_week"] = tz_week:strftime("%Y-%m-%d") -- YYYY-MM-DD of first day in ISO week.
data["request_at_tz_date"] = string.sub(tz_time, 1, 10) -- YYYY-MM-DD
data["request_at_tz_hour"] = string.sub(tz_time, 1, 13) .. ":00:00" -- YYYY-MM-DD HH:00:00
data["request_at_tz_minute"] = tz_time -- YYYY-MM-DD HH:MM:00

-- Check for log data set by the separate api backend proxy
-- (log_api_backend_proxy.lua). This is used for timing information.
src/api-umbrella/web-app/app/models/log_search/sql.rb (12 changes: 6 additions & 6 deletions)
@@ -30,17 +30,17 @@ def initialize(options = {})
when "minute"
raise "TODO"
when "hour"
@interval_field = "CAST(request_at_tz_date AS CHAR(10)) || '-' || CAST(request_at_tz_hour AS CHAR(2))"
@interval_field_format = "%Y-%m-%d-%H"
@interval_field = "request_at_tz_hour"
@interval_field_format = "%Y-%m-%d %H:%M:%S"
when "day"
@interval_field = "request_at_tz_date"
@interval_field_format = "%Y-%m-%d"
when "week"
@interval_field = "CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_week AS CHAR(2))"
@interval_field_format = "%G-%V"
@interval_field = "request_at_tz_week"
@interval_field_format = "%Y-%m-%d"
when "month"
@interval_field = "CAST(request_at_tz_year AS CHAR(4)) || '-' || CAST(request_at_tz_month AS CHAR(2))"
@interval_field_format = "%Y-%m"
@interval_field = "request_at_tz_month"
@interval_field_format = "%Y-%m-%d"
end

@query = {
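With each interval now mapped to a single date or timestamp column, the
generated SQL reduces to a plain GROUP BY, e.g. for the week interval
(illustrative, not part of the commit):

  SELECT request_at_tz_week, COUNT(*) AS hits
  FROM api_umbrella.logs
  GROUP BY request_at_tz_week
  ORDER BY request_at_tz_week;

and the matching @interval_field_format ("%Y-%m-%d") then describes each
grouped value (e.g. '2016-03-21').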
