From 6cb12b9f379a4208a345c96c28e1dece141bb84e Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Fri, 3 Nov 2023 17:56:56 +0000
Subject: [PATCH] Add crawl date output

---
 src/bilangwriter.cc | 5 +++++
 src/bilangwriter.hh | 1 +
 src/record.cc       | 8 ++++++++
 src/record.hh       | 2 ++
 warc2text_main.cc   | 2 +-
 5 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc
index ac8c96c..5688000 100644
--- a/src/bilangwriter.cc
+++ b/src/bilangwriter.cc
@@ -93,6 +93,8 @@ namespace warc2text{
             html_file.open(path + "/html.gz");
         if (output_files.count("file"))
             file_file.open(path + "/file.gz");
+        if (output_files.count("date"))
+            date_file.open(path + "/date.gz");
     }
 
     void LangWriter::write(Record const &record, std::string const &chunk) {
@@ -102,6 +104,8 @@ namespace warc2text{
             mime_file.writeLine(record.getHTTPcontentType());
         if (file_file.is_open())
             file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize()));
+        if (date_file.is_open())
+            date_file.writeLine(record.getWARCdate());
         if (html_file.is_open())
             html_file.writeLine(util::encodeBase64(record.getPayload()));
         if (text_file.is_open())
@@ -147,6 +151,7 @@ namespace warc2text{
                  {"l", boost::json::string(chunk.first)},
                  {"u", boost::json::string(record.getURL())},
                  {"c", boost::json::string(record.getHTTPcontentType())},
+                 {"ts", boost::json::string(record.getWARCdate())},
                  {"p", boost::json::string(chunk.second)},
             } << "\n";
         }
diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh
index bb1b6c8..cf80d52 100644
--- a/src/bilangwriter.hh
+++ b/src/bilangwriter.hh
@@ -51,6 +51,7 @@ namespace warc2text {
             GzipWriter text_file;
             GzipWriter html_file;
             GzipWriter file_file;
+            GzipWriter date_file;
         public:
             LangWriter(const std::string& folder, const std::unordered_set<std::string>& output_files);
             void write(const Record& record, const std::string &chunk);
diff --git a/src/record.cc b/src/record.cc
index 1b6206d..2a3806f 100644
--- a/src/record.cc
+++ b/src/record.cc
@@ -75,6 +75,10 @@ namespace warc2text {
             util::toLower(WARCcontentType);
         }
 
+        if (header.count("warc-date") == 1) {
+            WARCdate = header["warc-date"];
+        }
+
         payload_start = last_pos;
         if (header["warc-type"] == "response") {
             // parse HTTP header
@@ -287,6 +291,10 @@ namespace warc2text {
         return recordType;
     }
 
+    const std::string& Record::getWARCdate() const { // read-only view of the stored "warc-date" header value
+        return WARCdate;
+    }
+
     const std::string& Record::getWARCcontentType() const {
         return WARCcontentType;
     }
diff --git a/src/record.hh b/src/record.hh
index 675fcc9..00069e7 100644
--- a/src/record.hh
+++ b/src/record.hh
@@ -26,6 +26,7 @@ namespace warc2text {
         const std::string& getURL() const;
         const std::string& getRecordType() const;
         const std::string& getWARCcontentType() const;
+        const std::string& getWARCdate() const;
         const std::string& getHTTPcontentType() const;
         const std::string& getCharset() const;
         bool isBroaderDocumentFormat() const;
@@ -70,6 +71,7 @@ namespace warc2text {
         // these are present in the headers, but it's convenient to have them apart also
         std::string recordType;
         std::string WARCcontentType;
+        std::string WARCdate;
         std::string cleanHTTPcontentType;
         std::string charset;
         std::string url;
diff --git a/warc2text_main.cc b/warc2text_main.cc
index 37b4812..6a839b9 100644
--- a/warc2text_main.cc
+++ b/warc2text_main.cc
@@ -57,7 +57,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
                 " -o <output_folder>               Output folder, required\n"
                 " -f <output_files>                List of output files separated by commas\n"
                 "                                  Default (mandatory): \"url,text\"\n"
-                "                                  Optional values: \"mime,html,file\"\n"
+                "                                  Optional values: \"mime,html,file,date\"\n"
                 " --classifier                     Classifier to use: cld2 or fasttext\n"
                 " --fasttext-model <model_file>    Path to FastText model for fasttext classifier\n"
                 " --multilang                      Detect multiple languages in documents (up to 3),\n"