DRAFT: MEGAsync parser #4192
base: main
@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
"""Parser for MEGASync log files."""

from dfdatetime import time_elements as dfdatetime_time_elements

import pyparsing

from plaso.containers import events
from plaso.containers import time_events
from plaso.lib import definitions
from plaso.lib import errors
from plaso.parsers import logger
from plaso.parsers import manager
from plaso.parsers import text_parser


class MEGASyncEventData(events.EventData):
  """MEGASync log event data.

  Attributes:
    log_level (str): log level.
    message (str): message.
  """

  DATA_TYPE = 'megasync:log:line'

  def __init__(self):
    """Initializes event data."""
    super(MEGASyncEventData, self).__init__(data_type=self.DATA_TYPE)
    self.log_level = None
    self.message = None


class MEGASyncParser(text_parser.PyparsingSingleLineTextParser):
  """Parses MEGASync log files."""

  NAME = 'megasync'
  DATA_FORMAT = 'MEGASync log file'

  # Some types of MEGASync log lines can be very long.
  MAX_LINE_LENGTH = 65536

  _ENCODING = 'utf-8'

  _TWO_DIGITS = text_parser.PyparsingConstants.TWO_DIGITS

  # Timestamp format is: mm/dd-hh:mm:ss.######
  # For example: 03/21-04:13:44.621454
  _TIMESTAMP = pyparsing.Group(
      _TWO_DIGITS.setResultsName('month') + pyparsing.Suppress('/') +
      _TWO_DIGITS.setResultsName('day') + pyparsing.Suppress('-') +
      text_parser.PyparsingConstants.TIME_MSEC_ELEMENTS
  ).setResultsName('timestamp')

  _THREAD_NAME = pyparsing.Word(pyparsing.printables)

  _LOG_LEVEL = (
      pyparsing.Literal('DBG') |
      pyparsing.Literal('INFO') |
      pyparsing.Literal('WARN') |
      pyparsing.Literal('DTL') |
      pyparsing.Literal('ERR') |
      pyparsing.Literal('CRIT')).setResultsName('log_level')

  # The message is separated from the log level by one or two spaces.
  _MESSAGE = (
      pyparsing.White(' ', min=1, max=2).suppress() +
      pyparsing.restOfLine().setResultsName('message'))

  _LOG_LINE = _TIMESTAMP + _THREAD_NAME + _LOG_LEVEL + _MESSAGE

  # Indicates that the last log line was repeated multiple times.
  _REPEAT_LINE = (
      pyparsing.Suppress('[repeated x') +
      text_parser.PyparsingConstants.INTEGER +
      pyparsing.Suppress(']')
  ).setResultsName('repeats')

  _PROGRAM_START = pyparsing.Literal(
      '-----------------------------'
      ' program start '
      '-----------------------------')

  LINE_STRUCTURES = [
      ('line', _LOG_LINE),
      ('repeat', _REPEAT_LINE),
      ('program_start', _PROGRAM_START)]

  # MEGASync logs are very verbose, so only lines starting with one of these
  # prefixes produce events.
  _LINES_OF_INTEREST = [
      'Transfer (UPLOAD) finished',
      'Transfer (UPLOAD) starting',
      'Upload complete',
      'Creating thumb/preview']

  def __init__(self):
    """Initializes a parser."""
    super(MEGASyncParser, self).__init__()
    self._last_month = 0
    self._maximum_year = 0
    self._year_use = 0

  def _UpdateYear(self, parser_mediator, month):
    """Updates the year to use for events, based on the last observed month.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      month (int): month observed by the parser, where January is 1.
    """
    # TODO: Investigate using timestamps of the Gzip file, not the file
    # within the Gzip file, as the basis of estimation.
    if not self._year_use:
      self._year_use = parser_mediator.GetEstimatedYear()

      # zlib (used by MEGASync to compress rotated-out log files) can
      # generate Gzip files with an empty modification timestamp, which
      # should not be used as the estimated year. MEGASync logs cannot
      # originate from 1970, so this check is safe.
      if self._year_use == 1970:
        self._year_use = parser_mediator.GetCurrentYear()

    if not self._maximum_year:
      self._maximum_year = parser_mediator.GetLatestYear()

    if not self._last_month:
      self._last_month = month
      return

    # A backwards jump in the month indicates a year rollover in the log.
    if self._last_month > month:
      if self._year_use < self._maximum_year:
        self._year_use += 1
    self._last_month = month

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a structure of tokens derived from a line of a text file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in ('line', 'repeat', 'program_start'):
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    if key == 'line':
      time_elements_tuple = self._GetValueFromStructure(structure, 'timestamp')
      month, day_of_month, hours, minutes, seconds, microseconds = (
          time_elements_tuple)

      self._UpdateYear(parser_mediator, month)

      time_elements_tuple = (
          self._year_use, month, day_of_month, hours, minutes, seconds,
          microseconds)

      try:
        date_time = dfdatetime_time_elements.TimeElementsInMicroseconds(
            time_elements_tuple=time_elements_tuple)
      except ValueError:
        parser_mediator.ProduceExtractionWarning(
            'invalid timestamp: {0!s}'.format(time_elements_tuple))
        return

      log_message = self._GetValueFromStructure(structure, 'message')

      # Only produce events for log lines of forensic interest.
      for line_of_interest in self._LINES_OF_INTEREST:
        if log_message.startswith(line_of_interest):
          event_data = MEGASyncEventData()
          event_data.message = log_message
          event_data.log_level = self._GetValueFromStructure(
              structure, 'log_level')
          event = time_events.DateTimeValuesEvent(
              date_time, definitions.TIME_DESCRIPTION_RECORDED)
          parser_mediator.ProduceEventWithEventData(event, event_data)
          break

MEGASync logs are very verbose, so I currently only produce events for log lines that may be of forensic interest. If it's more desirable to produce events for every log entry, I can change this, but I recommend checking out the sample test file for context. Note that in an environment where MEGAsync ran for more than a few minutes, you will likely have 50 of those log files, totaling some ~4 million lines.
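To make the filtering concrete, here is a rough standalone sketch (not part of the change) that approximates the grammar and prefix filter above without plaso's PyparsingConstants helpers; the sample line and thread id in it are made up rather than taken from a real MEGAsync log:

```python
# Standalone approximation of the grammar above, for illustration only.
import pyparsing

two_digits = pyparsing.Word(pyparsing.nums, exact=2).setParseAction(
    lambda tokens: int(tokens[0]))
six_digits = pyparsing.Word(pyparsing.nums, exact=6).setParseAction(
    lambda tokens: int(tokens[0]))

timestamp = pyparsing.Group(
    two_digits('month') + pyparsing.Suppress('/') +
    two_digits('day') + pyparsing.Suppress('-') +
    two_digits('hours') + pyparsing.Suppress(':') +
    two_digits('minutes') + pyparsing.Suppress(':') +
    two_digits('seconds') + pyparsing.Suppress('.') +
    six_digits('microseconds')).setResultsName('timestamp')

thread_name = pyparsing.Word(pyparsing.printables)
log_level = pyparsing.oneOf('DBG INFO WARN DTL ERR CRIT').setResultsName(
    'log_level')
message = pyparsing.restOfLine().setResultsName('message')

log_line = timestamp + thread_name + log_level + message

lines_of_interest = ('Transfer (UPLOAD) finished', 'Upload complete')

# Hypothetical sample line in the mm/dd-hh:mm:ss.###### format.
sample = '03/21-04:13:44.621454 12345 INFO  Upload complete: example.jpg'

tokens = log_line.parseString(sample)
log_message = tokens['message'].strip()
if log_message.startswith(lines_of_interest):
  # This is the kind of line that becomes a MEGASyncEventData event above.
  print(tokens['timestamp'].asList(), tokens['log_level'], log_message)
```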

  def VerifyStructure(self, parser_mediator, line):
    """Verifies if a line from a text file is in the expected format.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      line (str): line from a text file.

    Returns:
      bool: True if the line is in the expected format, False if not.
    """
    verified = False
    for _, line_structure in self.LINE_STRUCTURES:
      try:
        line_structure.parseString(line)
      except pyparsing.ParseException:
        continue
      verified = True
      break

    if not verified:
      logger.debug('Not a MEGASync log file')

    return verified


manager.ParsersManager.RegisterParser(MEGASyncParser)

MEGASync compresses rotated-out log files using the gzip format. The `mtime` timestamp in the gzip stream, however, is empty, making this approach not work (currently I just set the estimate to the current year when the gzip timestamp is the epoch). I'm struggling a bit to find a good way to estimate the year of the log file in plaso, other than relying on `--preferred-year`. I've considered things like accessing the timestamps of the compressed file itself, which should be accurate except for some edge cases, but I don't see that the APIs currently allow for this.
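For reference, the gzip member header stores MTIME as a 4-byte little-endian value at offset 4 (RFC 1952), and a value of zero means no timestamp was recorded, which is what leads to the 1970 estimate above. A minimal sketch of reading it directly, outside of plaso's APIs and with a made-up file name:

```python
# Minimal sketch: read the MTIME field from a gzip header (RFC 1952).
import struct


def read_gzip_mtime(path):
  """Returns the gzip header MTIME as a POSIX timestamp, or None if unset."""
  with open(path, 'rb') as file_object:
    header = file_object.read(10)
  if len(header) < 10 or header[:2] != b'\x1f\x8b':
    raise ValueError('{0:s} is not a gzip file'.format(path))
  mtime = struct.unpack('<I', header[4:8])[0]
  return mtime or None


# The file name below is hypothetical; MEGAsync's rotated logs appear to
# store 0 in this field.
if read_gzip_mtime('MEGAsync.0.log.gz') is None:
  print('gzip MTIME is empty; fall back to another year estimate')
```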