Add crawl date output

bitextor · Nov 3, 2023 · 6cb12b9 · 6cb12b9
1 parent 827ea60
commit 6cb12b9
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 1 deletion.
diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc
@@ -93,6 +93,8 @@ namespace warc2text{
             html_file.open(path + "/html.gz");
         if (output_files.count("file"))
             file_file.open(path + "/file.gz");
+        if (output_files.count("date"))
+            date_file.open(path + "/date.gz");
     }
 
     void LangWriter::write(Record const &record, std::string const &chunk) {
@@ -102,6 +104,8 @@ namespace warc2text{
             mime_file.writeLine(record.getHTTPcontentType());
         if (file_file.is_open())
             file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize()));
+        if (date_file.is_open())
+            date_file.writeLine(record.getWARCdate());
         if (html_file.is_open())
             html_file.writeLine(util::encodeBase64(record.getPayload()));
         if (text_file.is_open())
@@ -147,6 +151,7 @@ namespace warc2text{
                  {"l", boost::json::string(chunk.first)},
                  {"u", boost::json::string(record.getURL())},
                  {"c", boost::json::string(record.getHTTPcontentType())},
+                 {"ts", boost::json::string(record.getWARCdate())},
                  {"p", boost::json::string(chunk.second)},
             } << "\n";
         }

diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh
@@ -51,6 +51,7 @@ namespace warc2text {
             GzipWriter text_file;
             GzipWriter html_file;
             GzipWriter file_file;
+            GzipWriter date_file;
         public:
             LangWriter(const std::string& folder, const std::unordered_set<std::string>& output_files);
             void write(const Record& record, const std::string &chunk);

diff --git a/src/record.cc b/src/record.cc
@@ -75,6 +75,10 @@ namespace warc2text {
             util::toLower(WARCcontentType);
         }
 
+        if (header.count("warc-date") == 1) {
+            WARCdate = header["warc-date"];
+        }
+
         payload_start = last_pos;
         if (header["warc-type"] == "response") {
             // parse HTTP header
@@ -287,6 +291,10 @@ namespace warc2text {
         return recordType;
     }
 
+    const std::string& Record::getWARCdate() const { // Accessor for the WARCdate member, which header parsing fills from the "warc-date" header entry when present.
+        return WARCdate;
+    }
+
     const std::string& Record::getWARCcontentType() const {
         return WARCcontentType;
     }

diff --git a/src/record.hh b/src/record.hh
@@ -26,6 +26,7 @@ namespace warc2text {
         const std::string& getURL() const;
         const std::string& getRecordType() const;
         const std::string& getWARCcontentType() const;
+        const std::string& getWARCdate() const;
         const std::string& getHTTPcontentType() const;
         const std::string& getCharset() const;
         bool isBroaderDocumentFormat() const;
@@ -70,6 +71,7 @@ namespace warc2text {
         // these are present in the headers, but it's convenient to have them apart also
         std::string recordType;
         std::string WARCcontentType;
+        std::string WARCdate;
         std::string cleanHTTPcontentType;
         std::string charset;
         std::string url;

diff --git a/warc2text_main.cc b/warc2text_main.cc
@@ -57,7 +57,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
                 " -o <output_folder>               Output folder, required\n"
                 " -f <output_files>                List of output files separated by commas\n"
                 "                                  Default (mandatory): \"url,text\"\n"
-                "                                  Optional values: \"mime,html,file\"\n"
+                "                                  Optional values: \"mime,html,file,date\"\n"
                 " --classifier                     Classifier to use: cld2 or fasttext\n"
                 " --fasttext-model <model_file>    Path to FastText model for fasttext classifier\n"
                 " --multilang                      Detect multiple languages in documents (up to 3),\n"