diff --git a/config.yaml b/config.yaml index 0358db9..f31a09c 100644 --- a/config.yaml +++ b/config.yaml @@ -84,8 +84,7 @@ # because of a large media file media_timeout: 0 - - ## Even more options (obscure use cases, detailed preferences) ## + ## Advanced behavior settings ## # Replacement string for the characters in dialog names which are potentially # problematic in filenames diff --git a/dumpers/bare/dumper.rb b/dumpers/bare/dumper.rb index bb07ce5..ec825ff 100644 --- a/dumpers/bare/dumper.rb +++ b/dumpers/bare/dumper.rb @@ -1,20 +1,13 @@ -require_relative '../dumper_interface' +require_relative '../single_file_line_dumper' -class BareDumper < DumperInterface - - def start_dialog(dialog) - safe_name = get_safe_name(dialog['print_name']) - outfile = File.join(get_backup_dir, safe_name + '.txt') - @stream = File.open(outfile, 'w') - end +class BareDumper < SingleFileLineDumper def dump_msg(dialog, msg) @stream.puts(msg['text']) if msg['text'] end - def end_dialog(dialog) - @stream.close - @stream = nil + def get_file_extension + '.txt' end end diff --git a/dumpers/daily_file_dumper.rb b/dumpers/daily_file_dumper.rb index e1c1dde..cadbe9c 100644 --- a/dumpers/daily_file_dumper.rb +++ b/dumpers/daily_file_dumper.rb @@ -4,7 +4,7 @@ class DailyFileDumper < DumperInterface - def start_dialog(dialog) + def start_dialog(dialog, progress) @prev_date = nil @output_buf = [] safe_name = get_safe_name(dialog['print_name']) @@ -14,13 +14,15 @@ def start_dialog(dialog) def dump_msg(dialog, msg) date = msg['date'] - return unless date + return false unless date date = Time.at(date).to_date flush(dialog) if date != @prev_date && !@output_buf.empty? @prev_date = date + true end def flush(dialog) + return if @output_buf.empty? || @prev_date.nil? 
filename = get_filename_for_date(dialog, @prev_date) path = File.join(@output_dir, filename) File.open(path, 'w') do |stream| @@ -31,6 +33,7 @@ def flush(dialog) def end_dialog(dialog) flush(dialog) + nil end def get_filename_for_date(dialog, date) diff --git a/dumpers/dumper_interface.rb b/dumpers/dumper_interface.rb index 4d03af6..64f1521 100644 --- a/dumpers/dumper_interface.rb +++ b/dumpers/dumper_interface.rb @@ -11,7 +11,7 @@ # * lib/util.rb exports some useful helpers. For example get_backup_dir returns # the output basedir path # * The same Dumper instance will be used for all dialogs in a backup session -# * $config contains a hash of options parsed from config.json5 +# * $config contains a hash of options parsed from the configuration file # (which is possibly interesting for custom dumper-specific options) class DumperInterface @@ -23,20 +23,31 @@ def start_backup end # Will be called just before dumping a dialog's messages - def start_dialog(dialog) - # dialog: Hash + def start_dialog(dialog, progress) + # dialog: Hash, progress: DumpProgress nil end + # Will be called before each message to dump, to determine whether it is new + # enough to back up + # This default makes sense in simple cases, override for advanced custom logic + def msg_fresh?(msg, progress) + # msg: Hash, progress: DumpProgress + !progress.last_id || msg['id'] > progress.last_id + end + # Will be called for each message to dump (from newest to oldest) # See the python binding documentation to get an idea of the msg attributes: # https://github.com/vysheng/tg/blob/master/README-PY.md#attributes-1 + # Returning boolean false causes an early abort (skips to the next dialog) def dump_msg(dialog, msg) # dialog, msg: Hash raise 'dump_msg must be implemented' end # Will be called just after dumping a dialog's messages + # Optionally return a hash with state information that will be saved as + # custom progress data # dialog: Hash def end_dialog(dialog) # dialog: Hash diff --git 
a/dumpers/json/dumper.rb b/dumpers/json/dumper.rb index 9ab059b..f259e4f 100644 --- a/dumpers/json/dumper.rb +++ b/dumpers/json/dumper.rb @@ -1,21 +1,14 @@ require 'json' -require_relative '../dumper_interface' +require_relative '../single_file_line_dumper' -class JsonDumper < DumperInterface - - def start_dialog(dialog) - safe_name = get_safe_name(dialog['print_name']) - outfile = File.join(get_backup_dir, safe_name + '.jsonl') - @stream = File.open(outfile, 'w') - end +class JsonDumper < SingleFileLineDumper def dump_msg(dialog, msg) @stream.puts(JSON.generate(msg)) end - def end_dialog(dialog) - @stream.close - @stream = nil + def get_file_extension + '.jsonl' end end diff --git a/dumpers/pisg/dumper.rb b/dumpers/pisg/dumper.rb index f2feff6..f557c7a 100644 --- a/dumpers/pisg/dumper.rb +++ b/dumpers/pisg/dumper.rb @@ -3,7 +3,7 @@ class PisgDumper < DailyFileDumper - def start_dialog(dialog) + def start_dialog(dialog, progress) super @users = {} @oldest_message_date = nil @@ -34,11 +34,13 @@ def end_dialog(dialog) path = File.join(@output_dir, 'oldest_message_date') File.open(path, 'w') {|f| f.write(@oldest_message_date.utc.iso8601) } end + + nil end def dump_msg(dialog, msg) - super - return unless msg['date'] and msg['from'] + return unless super + return unless msg['from'] return if msg['from']['print_name'].to_s == '' @oldest_message_date = Time.at(msg['date']) @users[msg['from']['id']] = msg['from'] diff --git a/dumpers/plaintext/dumper.rb b/dumpers/plaintext/dumper.rb index 2181fef..a923099 100644 --- a/dumpers/plaintext/dumper.rb +++ b/dumpers/plaintext/dumper.rb @@ -3,7 +3,7 @@ class PlaintextDumper < DailyFileDumper def dump_msg(dialog, msg) - super + return unless super date_str = Time.at(msg['date']).strftime('[%s] ' % $config['date_format']) from_name = get_full_name(msg['from']) diff --git a/dumpers/single_file_line_dumper.rb b/dumpers/single_file_line_dumper.rb new file mode 100644 index 0000000..5eddad4 --- /dev/null +++ 
b/dumpers/single_file_line_dumper.rb @@ -0,0 +1,29 @@ +require_relative 'dumper_interface' + +class SingleFileLineDumper < DumperInterface + + def start_dialog(dialog, progress) + @state = progress.dumper_state ? progress.dumper_state.clone : {} + outfile = @state['outfile'] + # Reset for every dialog: one dumper instance serves all dialogs + @prepender = outfile && DumpPrepender.new(outfile) + unless outfile + safe_name = get_safe_name(dialog['print_name']) + outfile = File.join(get_backup_dir, safe_name + get_file_extension) + @state['outfile'] = outfile + end + @stream = File.open(outfile, 'w') + end + + def end_dialog(dialog) + @stream.close + @stream = nil + @prepender.merge if @prepender + @state + end + + def get_file_extension + raise 'get_file_extension must be implemented' + end + +end diff --git a/lib/dump_prepender.rb b/lib/dump_prepender.rb new file mode 100644 index 0000000..d5fc55c --- /dev/null +++ b/lib/dump_prepender.rb @@ -0,0 +1,18 @@ +require 'fileutils' + +class DumpPrepender + + def initialize(filename) + @mainfile = filename + @tmpfile = filename + '.old' + FileUtils.mv(@mainfile, @tmpfile) + end + + def merge + File.open(@mainfile, 'a') do |outstream| + IO.copy_stream(@tmpfile, outstream) + end + File.delete(@tmpfile) + end + +end diff --git a/lib/dump_progress.rb b/lib/dump_progress.rb new file mode 100644 index 0000000..994e268 --- /dev/null +++ b/lib/dump_progress.rb @@ -0,0 +1,40 @@ +require 'json' + +class DumpProgress + + attr_reader :last_id + attr_reader :last_date + attr_reader :dumper_state + attr_writer :dumper_state + + def initialize(last_id = nil, last_date = nil, dumper_state = {}) + @last_id = last_id + @last_date = last_date + @dumper_state = dumper_state + end + + def self.from_hash(hash) + self.new(hash['last_id'], hash['last_date'], hash['dumper_state']) + end + + def to_hash + { + :last_id => @last_id, + :last_date => @last_date, + :dumper_state => @dumper_state + } + end + + def to_json(*a) + to_hash.to_json(*a) + end + + def bump_id(id) + @last_id = id if !@last_id || id > @last_id + end + +
def bump_date(date) + @last_date = date if !@last_date || date > @last_date + end + +end diff --git a/lib/util.rb b/lib/util.rb index 997aa12..e9a3dbf 100644 --- a/lib/util.rb +++ b/lib/util.rb @@ -1,3 +1,5 @@ +require_relative './dump_prepender' + def get_safe_name(name) name.gsub(/([\/\\<>:"|?*]|[^\u0021-\uFFFF])/, $config['character_substitute']) end diff --git a/telegram-history-dump.rb b/telegram-history-dump.rb index 7ae7c96..aeeb949 100755 --- a/telegram-history-dump.rb +++ b/telegram-history-dump.rb @@ -2,12 +2,13 @@ require 'fileutils' require 'json' -require 'socket' require 'logger' +require 'socket' require 'timeout' require 'yaml' -require_relative 'lib/util' require_relative 'lib/cli_parser' +require_relative 'lib/dump_progress' +require_relative 'lib/util' cli_opts = CliParser.parse(ARGV) @@ -38,7 +39,10 @@ def dump_dialog(dialog) if $config['download_media'].values.any? && $config['copy_media'] FileUtils.mkdir_p(get_media_dir(dialog)) end - $dumper.start_dialog(dialog) + id_str = dialog['id'].to_s + old_progress = $progress_snapshot[id_str] || DumpProgress.new + cur_progress = ($progress[id_str] ||= DumpProgress.new) + $dumper.start_dialog(dialog, old_progress) filter_regex = $config['filter_regex'] && eval($config['filter_regex']) offset = 0 keep_dumping = true @@ -55,11 +59,38 @@ def dump_dialog(dialog) end raise 'Expected array' unless msg_chunk.is_a?(Array) msg_chunk.reverse_each do |msg| - $log.warn('Message without date: %s' % msg) unless msg['date'] - unless msg['text'] && filter_regex && filter_regex =~ msg['text'] + dump_msg = true + if msg['id'] + cur_progress.bump_id(msg['id']) + else + $log.warn('Dropping message without id: %s' % msg) + dump_msg = false + end + if msg['date'] + cur_progress.bump_date(msg['date']) + else + $log.warn('Message without date: %s' % msg) + end + + if msg['text'] && filter_regex && filter_regex =~ msg['text'] + dump_msg = false + end + + if msg['id'] && !$dumper.msg_fresh?(msg, old_progress) + if keep_dumping +
$log.info('Reached end of new messages since last backup') + end + dump_msg = false + keep_dumping = false + end + + if dump_msg process_media(dialog, msg) - $dumper.dump_msg(dialog, msg) + if $dumper.dump_msg(dialog, msg) == false + keep_dumping = false + end end + offset += 1 if $config['backlog_limit'] > 0 && offset >= $config['backlog_limit'] keep_dumping = false @@ -69,7 +100,8 @@ def dump_dialog(dialog) keep_dumping = false if msg_chunk.length < $config['chunk_size'] sleep($config['chunk_delay']) if keep_dumping end - $dumper.end_dialog(dialog) + state = $dumper.end_dialog(dialog) || {} + cur_progress.dumper_state=(state) end def process_media(dialog, msg) @@ -148,6 +180,21 @@ def format_dialog_list(dialogs) FileUtils.mkdir_p(get_backup_dir) +$progress = {} +$progress_snapshot = {} +progress_file = File.join(get_backup_dir, 'progress.json') +progress_json = File.exist?(progress_file) ? File.read(progress_file) : '{}' +progress_hash = JSON.parse(progress_json) +if progress_hash['dumper'] && progress_hash['dumper'] != $config['dumper'] + raise 'Dumper conflict: configured for "%s" but progress file reads "%s". '\ + 'Either use the same dumper or delete the output directory.'\ + % [$config['dumper'], progress_hash['dumper']] +end +(progress_hash['dialogs'] || {}).each do |k,v| + $progress[k] = DumpProgress.from_hash(v) + $progress_snapshot[k] = DumpProgress.from_hash(v) +end + $log.info('Loading dumper module \'%s\'' % $config['dumper']) require_relative 'dumpers/%s/dumper.rb' % $config['dumper'] $dumper = Dumper.new @@ -190,6 +237,14 @@ def format_dialog_list(dialogs) end end +$log.info('Saving progress file') +progress_hash = { + :dumper => $config['dumper'], + :dialogs => $progress +} +progress_json = JSON.pretty_generate(progress_hash) + "\n" +File.write(progress_file, progress_json) + $dumper.end_backup if cli_opts.kill_tg connect_socket