From 6cb12b9f379a4208a345c96c28e1dece141bb84e Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 3 Nov 2023 17:56:56 +0000 Subject: [PATCH] Add crawl date output --- src/bilangwriter.cc | 5 +++++ src/bilangwriter.hh | 1 + src/record.cc | 8 ++++++++ src/record.hh | 2 ++ warc2text_main.cc | 2 +- 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index ac8c96c..5688000 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -93,6 +93,8 @@ namespace warc2text{ html_file.open(path + "/html.gz"); if (output_files.count("file")) file_file.open(path + "/file.gz"); + if (output_files.count("date")) + date_file.open(path + "/date.gz"); } void LangWriter::write(Record const &record, std::string const &chunk) { @@ -102,6 +104,8 @@ namespace warc2text{ mime_file.writeLine(record.getHTTPcontentType()); if (file_file.is_open()) file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize())); + if (date_file.is_open()) + date_file.writeLine(record.getWARCdate()); if (html_file.is_open()) html_file.writeLine(util::encodeBase64(record.getPayload())); if (text_file.is_open()) @@ -147,6 +151,7 @@ namespace warc2text{ {"l", boost::json::string(chunk.first)}, {"u", boost::json::string(record.getURL())}, {"c", boost::json::string(record.getHTTPcontentType())}, + {"ts", boost::json::string(record.getWARCdate())}, {"p", boost::json::string(chunk.second)}, } << "\n"; } diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index bb1b6c8..cf80d52 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -51,6 +51,7 @@ namespace warc2text { GzipWriter text_file; GzipWriter html_file; GzipWriter file_file; + GzipWriter date_file; public: LangWriter(const std::string& folder, const std::unordered_set<std::string>& output_files); void write(const Record& record, const std::string &chunk); diff --git a/src/record.cc b/src/record.cc index 1b6206d..2a3806f 100644 --- 
a/src/record.cc +++ b/src/record.cc @@ -75,6 +75,10 @@ namespace warc2text { util::toLower(WARCcontentType); } + if (header.count("warc-date") == 1) { + WARCdate = header["warc-date"]; + } + payload_start = last_pos; if (header["warc-type"] == "response") { // parse HTTP header @@ -287,6 +291,10 @@ namespace warc2text { return recordType; } + const std::string& Record::getWARCdate() const { + return WARCdate; + } + const std::string& Record::getWARCcontentType() const { return WARCcontentType; } diff --git a/src/record.hh b/src/record.hh index 675fcc9..00069e7 100644 --- a/src/record.hh +++ b/src/record.hh @@ -26,6 +26,7 @@ namespace warc2text { const std::string& getURL() const; const std::string& getRecordType() const; const std::string& getWARCcontentType() const; + const std::string& getWARCdate() const; const std::string& getHTTPcontentType() const; const std::string& getCharset() const; bool isBroaderDocumentFormat() const; @@ -70,6 +71,7 @@ namespace warc2text { // these are present in the headers, but it's convenient to have them apart also std::string recordType; std::string WARCcontentType; + std::string WARCdate; std::string cleanHTTPcontentType; std::string charset; std::string url; diff --git a/warc2text_main.cc b/warc2text_main.cc index 37b4812..6a839b9 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -57,7 +57,7 @@ void parseArgs(int argc, char *argv[], Options& out) { " -o Output folder, required\n" " -f List of output files separated by commas\n" " Default (mandatory): \"url,text\"\n" - " Optional values: \"mime,html,file\"\n" + " Optional values: \"mime,html,file,date\"\n" " --classifier Classifier to use: cld2 or fasttext\n" " --fasttext-model Path to FastText model for fasttext classifier\n" " --multilang Detect multiple languages in documents (up to 3),\n"