From 5d570f83a5d269990482259bd9beaa815d36bfa4 Mon Sep 17 00:00:00 2001 From: ducky-hong Date: Mon, 16 May 2016 01:27:46 +0900 Subject: [PATCH] [#22] Change parsing engine from regex to PEG (parboiled) parboiled is PEG (Parsing Expression Grammar) implementation. PEG is more concise than regex, and regex could not handle recursive structure well. GcEventNode is added to access the parsed data easily. Its concrete class is generated by Auto Value library which utilizes annotation processing. IDE setup is required and the instruction can be found here: https://github.com/google/auto/issues/106 --- common/src/main/proto/gc_model.proto | 2 +- parser/.gitignore | 2 + parser/build.gradle | 16 +- .../edu/kaist/algo/parser/CmsGcLogRule.java | 196 ++++++++++++++ .../edu/kaist/algo/parser/CmsLogParser.java | 243 +++++------------- .../edu/kaist/algo/parser/CmsLogPatterns.java | 30 --- .../edu/kaist/algo/parser/GcEventNode.java | 69 +++++ .../kaist/algo/parser/CmsLogParserTest.java | 31 ++- 8 files changed, 375 insertions(+), 214 deletions(-) create mode 100644 parser/.gitignore create mode 100644 parser/src/main/java/edu/kaist/algo/parser/CmsGcLogRule.java delete mode 100644 parser/src/main/java/edu/kaist/algo/parser/CmsLogPatterns.java create mode 100644 parser/src/main/java/edu/kaist/algo/parser/GcEventNode.java diff --git a/common/src/main/proto/gc_model.proto b/common/src/main/proto/gc_model.proto index c7cf26c..c6393ba 100644 --- a/common/src/main/proto/gc_model.proto +++ b/common/src/main/proto/gc_model.proto @@ -21,4 +21,4 @@ message GcEvent { double sys_time = 7; // time spent in OS call or waiting for system event double real_time = 8; // (user_time + sys_tim) / threads# + alpha double ref_time = 9; // reference processing time -} \ No newline at end of file +} diff --git a/parser/.gitignore b/parser/.gitignore new file mode 100644 index 0000000..3e83e4a --- /dev/null +++ b/parser/.gitignore @@ -0,0 +1,2 @@ +src/generated +src/generated_tests diff --git a/parser/build.gradle b/parser/build.gradle index dca15b4..c51fb6f 100644 --- a/parser/build.gradle +++ b/parser/build.gradle @@ -1,4 +1,14 @@ +apply plugin: 'idea' + dependencies { - compile project(':common') - compile 'org.apache.commons:commons-lang3:3.4' -} \ No newline at end of file + compile project(':common') + compile 'org.apache.commons:commons-lang3:3.4' + compile 'org.parboiled:parboiled-java:1.1.7' + compile 'com.google.auto.value:auto-value:1.2' +} + +idea { + module { + sourceDirs += file("${projectDir}/src/generated") + } +} diff --git a/parser/src/main/java/edu/kaist/algo/parser/CmsGcLogRule.java b/parser/src/main/java/edu/kaist/algo/parser/CmsGcLogRule.java new file mode 100644 index 0000000..6ba3192 --- /dev/null +++ b/parser/src/main/java/edu/kaist/algo/parser/CmsGcLogRule.java @@ -0,0 +1,196 @@ +package edu.kaist.algo.parser; + +import org.apache.commons.lang3.StringUtils; +import org.parboiled.BaseParser; +import org.parboiled.Rule; +import org.parboiled.annotations.BuildParseTree; +import org.parboiled.annotations.Label; +import org.parboiled.annotations.SuppressSubnodes; + +/** + * PEG (Parsing Expression Grammar) for the CMS GC log. + * + *

This PEG is for parsing the one line of CMS GC logs. The line should be complete. That is, + * it should not be cut off by another thread's interference. + * + *

Beware: CMS-related logs are not supported yet. + * + *

Following options are required: + *

+ * + *

PEG (whitespaces are ignored for conciseness): + *

+ * InputLine <- Event UserSysRealTimes
+ * Event <- (Time ': ')? '[' TypeAndDetail (Event)* UsageAndElapsedTime ']'
+ * Time <- Digits '.' Digits ' secs'?
+ * Digits <- [0-9]+
+ * TypeAndDetail <- Type ('(' Detail ')')? ': '?
+ * Type <- 'GC' / 'ParNew' / 'CMS' / 'Full GC' / 'Metaspace' / '1 CMS-initial-mark'
+ *       / 'YG occupancy' / 'Rescan (parallel)' / 'weak refs processing' / 'class unloading'
+ *       / 'scrub symbol table' / 'scrub string table' / '1 CMS-remark'
+ * Detail <- 'System.gc()' / !')'+
+ * UsageAndElapsedTime <- UsageChange? (', ' Event)? (', ' Time)?
+ * UsageChange <- (Size '-&gt;')? UsageWithTotal
+ * UsageWithTotal <- Size '(' Size ')'
+ * Size <- Digits 'K '
+ * UserSysRealTimes <- '[ Times: user=' Time ' sys=' Time ', real=' Time ']'
+ * 
+ */ +@BuildParseTree +public class CmsGcLogRule extends BaseParser { + + Rule InputLine() { + return Sequence( + push(GcEventNode.builder()), + Event(), + UserSysRealTimes(), + push(popAsNode().build()) + ); + } + + Rule Event() { + return Sequence( + Optional( + TimeLong(), ": ", + swap() && push(popAsNode().timestamp(popAsLong())) + ), + "[", TypeAndDetail(), " ", + ZeroOrMore( + push(GcEventNode.builder()), + Event(), + swap() && push(popAsNode().addChild(popAsNode().build())) + ), + " ", UsageAndElapsedTime(), "] " + ); + } + + Rule TypeAndDetail() { + return Sequence( + Type(), + push(popAsNode().type(match())), + Optional(" ", "(", Detail(), push(popAsNode().detail(match())), ")"), + Optional(": ") + ); + } + + @SuppressSubnodes + Rule Type() { + return FirstOf("GC", "ParNew", "CMS", "Full GC", "Metaspace", "1 CMS-initial-mark", + "YG occupancy", "Rescan (parallel)", "weak refs processing", "class unloading", + "scrub symbol table", "scrub string table", "1 CMS-remark"); + } + + @SuppressSubnodes + Rule Detail() { + return FirstOf("System.gc()", OneOrMore(NoneOf(")"))); + } + + Rule UsageAndElapsedTime() { + return Sequence( + Optional(UsageChange()), + Optional(", ", + push(GcEventNode.builder()), + Event(), // Metaspace + swap() && push(popAsNode().addChild(popAsNode().build())) + ), + Optional(", ", + TimeDouble(), + swap() && push(popAsNode().elapsedTime(popAsDouble())) + ) + ); + } + + Rule UsageChange() { + return Sequence( + Optional( + Size(), "->", + swap() && push(popAsNode().prevUsage(popAsLong())) + ), + UsageWithTotal() + ); + } + + Rule UsageWithTotal() { + return Sequence( + Size(), + "(", Size(), ")", + swap3() && push(popAsNode().afterUsage(popAsLong())), + push(popAsNode().capacity(popAsLong())) + ); + } + + Rule Size() { + return Sequence( + Digits(), + push(Long.valueOf(match())), + WhiteSpace(), "K " + ); + } + + Rule UserSysRealTimes() { + return Sequence( + "[", "Times: ", "user=", TimeDouble(), " sys=", TimeDouble(), ", real=", TimeDouble(), "]", + swap4() && push(popAsNode().user(popAsDouble())), + push(popAsNode().sys(popAsDouble())), + push(popAsNode().real(popAsDouble())) + ); + } + + @Label("Time") + @SuppressSubnodes + Rule TimeDouble() { + return Sequence( + Sequence(Digits(), ".", Digits()), + push(Double.valueOf(match())), + Optional(" secs") + ); + } + + @Label("Time") + @SuppressSubnodes + Rule TimeLong() { + return Sequence( + Sequence(Digits(), ".", Digits()), + push(Long.valueOf(StringUtils.remove(match(), "."))) + ); + } + + @SuppressSubnodes + Rule Digits() { + return OneOrMore(Digit()); + } + + Rule Digit() { + return CharRange('0', '9'); + } + + @SuppressSubnodes + Rule WhiteSpace() { + return ZeroOrMore(AnyOf(" \t\f")); + } + + @Override + protected Rule fromStringLiteral(String string) { + return string.endsWith(" ") + ? Sequence(String(string.substring(0, string.length() - 1)), WhiteSpace()) + : String(string); + } + + protected Double popAsDouble() { + return (Double) pop(); + } + + protected GcEventNode.Builder popAsNode() { + return (GcEventNode.Builder) pop(); + } + + protected Long popAsLong() { + return (Long) pop(); + } +} diff --git a/parser/src/main/java/edu/kaist/algo/parser/CmsLogParser.java b/parser/src/main/java/edu/kaist/algo/parser/CmsLogParser.java index fae5f2b..be3ad8f 100644 --- a/parser/src/main/java/edu/kaist/algo/parser/CmsLogParser.java +++ b/parser/src/main/java/edu/kaist/algo/parser/CmsLogParser.java @@ -1,8 +1,14 @@ package edu.kaist.algo.parser; +import com.google.common.collect.ImmutableList; + import edu.kaist.algo.model.GcEvent; import org.apache.commons.lang3.StringUtils; +import org.parboiled.Parboiled; +import org.parboiled.parserunners.BasicParseRunner; +import org.parboiled.parserunners.ParseRunner; +import org.parboiled.support.ParsingResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,28 +21,38 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; /** * The CmsLogParser is responsible to parse the CMS GC log file. - * - * @author ducky */ public class CmsLogParser { private static final Logger logger = LoggerFactory.getLogger(CmsLogParser.class); + private static final Pattern WRITER_THREAD = Pattern.compile(""); + private static final Pattern MULTI_LINE = Pattern.compile(".*GC.*\\[CMS$"); + private int currentThread; - private Map threadToIncompleteLine = new HashMap<>(); + private final Map threadToIncompleteLine = new HashMap<>(); + private final ParseRunner parseRunner; + + public CmsLogParser() { + final CmsGcLogRule parser = Parboiled.createParser(CmsGcLogRule.class); + parseRunner = new BasicParseRunner<>(parser.InputLine()); + } /** * Parses the given log file to the list of GcEvent. + * * @param path path of the log file * @return list of GcEvent */ - public List parse(Path path) { + public List parse(final Path path) { try (BufferedReader bufferedReader = Files.newBufferedReader(path)) { return parse(bufferedReader.lines()); } catch (IOException ioe) { @@ -47,21 +63,23 @@ public List parse(Path path) { /** * Parses the given logs to the list of GcEvent. + * * @param logs stream of gc log lines * @return list of GcEvent */ - public List parse(Stream logs) { - return logs.map(this::parseLine).filter(Objects::nonNull).collect(Collectors.toList()); + public List parse(final Stream logs) { + return logs.map(this::parseLine).filter(Objects::nonNull) + .collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); } private GcEvent parseLine(String line) { if (line.startsWith(" result = parseRunner.run(line); + if (result.matched) { + final GcEventNode node = (GcEventNode) result.resultValue; + final GcEvent.LogType logType = convertLogType(node); + final GcEvent.Builder builder = GcEvent.newBuilder() + .setThread(currentThread) + .setTimestamp(node.timestamp()) + .setLogType(logType) + .setPauseTime(node.elapsedTime()) + .setUserTime(node.user()) + .setSysTime(node.sys()) + .setRealTime(node.real()); + if (logType == GcEvent.LogType.CMS_FINAL_REMARK) { + final Optional weakRefsProcessingEvent = node.children().stream() + .filter(e -> StringUtils.equals(e.type(), "weak refs processing")) + .findFirst(); + if (weakRefsProcessingEvent.isPresent()) { + builder.setRefTime(weakRefsProcessingEvent.get().elapsedTime()); } } + return builder.build(); } return null; } - private static class UserSysRealTime { - public final double user; - public final double sys; - public final double real; - - private UserSysRealTime(double user, double sys, double real) { - this.user = user; - this.sys = sys; - this.real = real; - } - - public static UserSysRealTime of(double user, double sys, double real) { - return new UserSysRealTime(user, sys, real); + private GcEvent.LogType convertLogType(final GcEventNode node) { + if (StringUtils.equals(node.type(), "Full GC")) { + return GcEvent.LogType.FULL_GC; + } else if (StringUtils.equals(node.type(), "GC")) { + if (StringUtils.equals(node.detail(), "CMS Initial Mark")) { + return GcEvent.LogType.CMS_INIT_MARK; + } else if (StringUtils.equals(node.detail(), "CMS Final Remark")) { + return GcEvent.LogType.CMS_FINAL_REMARK; + } else { + final List types = node.children().stream() + .map(GcEventNode::type).collect(Collectors.toList()); + final boolean isFullGc = Collections.indexOfSubList( + types, ImmutableList.of("ParNew", "CMS")) != -1; + if (isFullGc) { + return GcEvent.LogType.FULL_GC; + } else { + return GcEvent.LogType.MINOR_GC; + } + } } + throw new IllegalArgumentException("Log type must be specified. Check the log."); } } diff --git a/parser/src/main/java/edu/kaist/algo/parser/CmsLogPatterns.java b/parser/src/main/java/edu/kaist/algo/parser/CmsLogPatterns.java deleted file mode 100644 index e83ab8f..0000000 --- a/parser/src/main/java/edu/kaist/algo/parser/CmsLogPatterns.java +++ /dev/null @@ -1,30 +0,0 @@ -package edu.kaist.algo.parser; - -import java.util.regex.Pattern; - -/** - * Patterns for CMS log format. - */ -public class CmsLogPatterns { - private CmsLogPatterns() { - // Do not instantiate. - } - - static final Pattern WRITER_THREAD = Pattern.compile(""); - static final Pattern MULTI_LINE = Pattern.compile(".*GC.*\\[CMS$"); - static final Pattern USER_SYS_REAL_EXTRACT = Pattern.compile("user=(.+) sys=(.+), real=(.+)"); - static final Pattern FULL_GC_START = Pattern.compile("^(\\d+.\\d{3}): \\[Full GC(?: \\(([^\\)]+)\\))?"); - static final Pattern CMS_FULL_GC = Pattern.compile( "\\[CMS(?: \\(([^\\)]+)\\))?: (\\d+)K->(\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\] (\\d+)K->(\\d+)K\\((\\d+)K\\),[^,]+, (\\d+.\\d+) secs\\]"); - static final Pattern METASPACE = Pattern.compile("\\[Metaspace: (\\d+)K->(\\d+)K\\((\\d+)K\\)\\]"); - static final Pattern USER_SYS_REAL_TIME = Pattern.compile("\\[Times: (.+) secs\\]"); - static final Pattern GC = Pattern.compile("^(\\d+.\\d{3}): \\[GC \\(([^\\)]+)\\).*\\[Times: (.+) secs\\].*$"); - static final Pattern PAR_NEW = Pattern.compile( "\\[ParNew: (\\d+)K->(\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\] (\\d+)K->(\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\]"); - static final Pattern PAR_NEW_TRIGGERD_FULL_GC = Pattern.compile( "\\[ParNew(?: \\(([^\\)]+)\\))?: (\\d+)K->(\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\]\\d+.\\d+: \\[CMS(?: \\(([^\\)]+)\\))?: (\\d+)K->(\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\] (\\d+)K->(\\d+)K\\((\\d+)K\\), \\[Metaspace: (\\d+)K->(\\d+)K\\((\\d+)K\\)\\], (\\d+.\\d+) secs\\]"); - static final Pattern CMS_INITIAL_MARK = Pattern.compile( "\\[1 CMS-initial-mark: (\\d+)K\\((\\d+)K\\)\\] (\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\]"); - static final Pattern CMS_REMARK = Pattern.compile( "\\[1 CMS-remark: (\\d+)K\\((\\d+)K\\)\\] (\\d+)K\\((\\d+)K\\), (\\d+.\\d+) secs\\]"); - static final Pattern YG_OCCUPANCY = Pattern.compile("\\[YG occupancy: (\\d+) K \\((\\d+) K\\)\\]"); - static final Pattern WEAK_REFS_PROCESSING = Pattern.compile("\\[weak refs processing, (\\d+.\\d+) secs\\]"); - static final Pattern CLASS_UNLOADING = Pattern.compile("\\[class unloading, (\\d+.\\d+) secs\\]"); - static final Pattern SCRUB_SYMBOL_TABLE = Pattern.compile("\\[scrub symbol table, (\\d+.\\d+) secs\\]"); - static final Pattern SCRUB_STRING_TABLE = Pattern.compile("\\[scrub string table, (\\d+.\\d+) secs\\]"); -} diff --git a/parser/src/main/java/edu/kaist/algo/parser/GcEventNode.java b/parser/src/main/java/edu/kaist/algo/parser/GcEventNode.java new file mode 100644 index 0000000..9ff2c5a --- /dev/null +++ b/parser/src/main/java/edu/kaist/algo/parser/GcEventNode.java @@ -0,0 +1,69 @@ +package edu.kaist.algo.parser; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableList; + +import javax.annotation.Nullable; + +@AutoValue +public abstract class GcEventNode { + + @Nullable public abstract Long timestamp(); + + public abstract String type(); + + @Nullable public abstract String detail(); + + @Nullable public abstract Long prevUsage(); + + @Nullable public abstract Long afterUsage(); + + @Nullable public abstract Long capacity(); + + @Nullable public abstract Double elapsedTime(); + + @Nullable public abstract Double user(); + + @Nullable public abstract Double sys(); + + @Nullable public abstract Double real(); + + public abstract ImmutableList children(); + + public static Builder builder() { + return new AutoValue_GcEventNode.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder timestamp(Long timestamp); + + public abstract Builder type(String type); + + public abstract Builder detail(String detail); + + public abstract Builder prevUsage(Long prevUsage); + + public abstract Builder afterUsage(Long afterUsage); + + public abstract Builder capacity(Long capacity); + + public abstract Builder elapsedTime(Double elapsedTime); + + public abstract Builder user(Double user); + + public abstract Builder sys(Double sys); + + public abstract Builder real(Double real); + + abstract ImmutableList.Builder childrenBuilder(); + + public Builder addChild(GcEventNode child) { + childrenBuilder().add(child); + return this; + } + + public abstract GcEventNode build(); + } +} diff --git a/parser/src/test/java/edu/kaist/algo/parser/CmsLogParserTest.java b/parser/src/test/java/edu/kaist/algo/parser/CmsLogParserTest.java index ebc81f6..bf128ef 100644 --- a/parser/src/test/java/edu/kaist/algo/parser/CmsLogParserTest.java +++ b/parser/src/test/java/edu/kaist/algo/parser/CmsLogParserTest.java @@ -32,7 +32,7 @@ public void testParseMinorGc() throws Exception { } @Test - public void testParseFullGc() throws Exception { + public void testParseFullGcSystem() throws Exception { final String log = "\n" + "111.350: [GC (Allocation Failure) 111.351: [ParNew: 30720K->3392K(30720K), 0.0095220 secs] 75542K->50511K(99008K), 0.0097545 secs] [Times: user=0.02 sys=0.00, real=0.01 secs]\n" + "111.499: [GC (Allocation Failure) 111.499: [ParNew: 30720K->3391K(30720K), 0.0105871 secs] 77839K->53229K(99008K), 0.0108695 secs] [Times: user=0.02 sys=0.00, real=0.01 secs]\n" @@ -43,7 +43,7 @@ public void testParseFullGc() throws Exception { } @Test - public void testParseFullGcTriggeredByParNew() throws Exception { + public void testParseConcurrentModeFailure() throws Exception { final String log = "\n" + "126.426: [GC (Allocation Failure) 126.426: [ParNew: 30720K->3392K(30720K), 0.0128764 secs] 83156K->58798K(99008K), 0.0131301 secs] [Times: user=0.02 sys=0.00, real=0.01 secs]\n" + "126.487: [GC (Allocation Failure) 126.487: [ParNew: 30720K->3392K(30720K), 0.0110704 secs] 86126K->61977K(99008K), 0.0113176 secs] [Times: user=0.02 sys=0.00, real=0.01 secs]\n" @@ -61,6 +61,31 @@ public void testParseFullGcTriggeredByParNew() throws Exception { assertGcEvent(result.get(5), GcEvent.LogType.FULL_GC, 11779, 126743, 0.2280550, 0.23, 0.00, 0.22); } + @Test + public void testParseFullGc() throws Exception { + final String log = "\n" + + "126.743: [GC (Allocation Failure) 126.743: [ParNew: 30719K->30719K(30720K), 0.0000574 secs]126.743: [CMS" + + ": 66466K->45817K(68288K), 0.2277434 secs] 97186K->45817K(99008K), [Metaspace: 60953K->60953K(1105920K)], 0.2280550 secs] [Times: user=0.23 sys=0.00, real=0.22 secs]\n" + + "127.029: [Full GC 127.029: [CMS: 49837K->38462K(68288K), 0.1817762 secs] 65053K->38462K(99008K), [Metaspace: 60946K->60946K(1105920K)], 0.1819922 secs] [Times: user=0.18 sys=0.00, real=0.18 secs]"; + final List result = parser.parse(Stream.of(log.split("\n"))); + assertEquals(2, result.size()); // Do not count CMS-concurrent-mark yet. + assertGcEvent(result.get(0), GcEvent.LogType.FULL_GC, 11779, 126743, 0.2280550, 0.23, 0.00, 0.22); + assertGcEvent(result.get(1), GcEvent.LogType.FULL_GC, 11779, 127029, 0.1819922, 0.18, 0.00, 0.18); + } + + @Test + public void testParseFullGcConcurrentModeFailure() throws Exception { + final String log = "\n" + + "55.780: [Full GC (Allocation Failure) 55.780: [CMS\n" + + "\n" + + "55.799: [CMS-concurrent-mark: 0.113/0.158 secs] [Times: user=0.66 sys=0.08, real=0.15 secs]\n" + + "\n" + + " (concurrent mode failure): 64750K->45276K(68288K), 0.1975301 secs] 95470K->45276K(99008K), [Metaspace: 61093K->61093K(1107968K)], 0.1979402 secs] [Times: user=0.19 sys=0.00, real=0.20 secs]"; + final List result = parser.parse(Stream.of(log.split("\n"))); + assertEquals(1, result.size()); // Do not count CMS-concurrent-mark yet. + assertGcEvent(result.get(0), GcEvent.LogType.FULL_GC, 11779, 55780, 0.1979402, 0.19, 0.00, 0.20); + } + @Test public void testParseInitialMark() throws Exception { final String log = "\n" @@ -103,4 +128,4 @@ private static void assertGcEvent(GcEvent event, GcEvent.LogType logType, int th assertEquals(sys, event.getSysTime(), 0.0001); assertEquals(real, event.getRealTime(), 0.0001); } -} \ No newline at end of file +}