Skip to content

Commit

Permalink
Add crawl date output
Browse files Browse the repository at this point in the history
  • Loading branch information
jelmervdl committed Nov 3, 2023
1 parent 827ea60 commit 6cb12b9
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/bilangwriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ namespace warc2text{
html_file.open(path + "/html.gz");
if (output_files.count("file"))
file_file.open(path + "/file.gz");
if (output_files.count("date"))
date_file.open(path + "/date.gz");
}

void LangWriter::write(Record const &record, std::string const &chunk) {
Expand All @@ -102,6 +104,8 @@ namespace warc2text{
mime_file.writeLine(record.getHTTPcontentType());
if (file_file.is_open())
file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize()));
if (date_file.is_open())
date_file.writeLine(record.getWARCdate());
if (html_file.is_open())
html_file.writeLine(util::encodeBase64(record.getPayload()));
if (text_file.is_open())
Expand Down Expand Up @@ -147,6 +151,7 @@ namespace warc2text{
{"l", boost::json::string(chunk.first)},
{"u", boost::json::string(record.getURL())},
{"c", boost::json::string(record.getHTTPcontentType())},
{"ts", boost::json::string(record.getWARCdate())},
{"p", boost::json::string(chunk.second)},
} << "\n";
}
Expand Down
1 change: 1 addition & 0 deletions src/bilangwriter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace warc2text {
GzipWriter text_file;
GzipWriter html_file;
GzipWriter file_file;
GzipWriter date_file;
public:
LangWriter(const std::string& folder, const std::unordered_set<std::string>& output_files);
void write(const Record& record, const std::string &chunk);
Expand Down
8 changes: 8 additions & 0 deletions src/record.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ namespace warc2text {
util::toLower(WARCcontentType);
}

if (header.count("warc-date") == 1) {
WARCdate = header["warc-date"];
}

payload_start = last_pos;
if (header["warc-type"] == "response") {
// parse HTTP header
Expand Down Expand Up @@ -287,6 +291,10 @@ namespace warc2text {
return recordType;
}

const std::string& Record::getWARCdate() const {
return WARCdate;
}

const std::string& Record::getWARCcontentType() const {
return WARCcontentType;
}
Expand Down
2 changes: 2 additions & 0 deletions src/record.hh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ namespace warc2text {
const std::string& getURL() const;
const std::string& getRecordType() const;
const std::string& getWARCcontentType() const;
const std::string& getWARCdate() const;
const std::string& getHTTPcontentType() const;
const std::string& getCharset() const;
bool isBroaderDocumentFormat() const;
Expand Down Expand Up @@ -70,6 +71,7 @@ namespace warc2text {
// these are present in the headers, but it's convenient to have them apart also
std::string recordType;
std::string WARCcontentType;
std::string WARCdate;
std::string cleanHTTPcontentType;
std::string charset;
std::string url;
Expand Down
2 changes: 1 addition & 1 deletion warc2text_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
" -o <output_folder> Output folder, required\n"
" -f <output_files> List of output files separated by commas\n"
" Default (mandatory): \"url,text\"\n"
" Optional values: \"mime,html,file\"\n"
" Optional values: \"mime,html,file,date\"\n"
" --classifier Classifier to use: cld2 or fasttext\n"
" --fasttext-model <model_file> Path to FastText model for fasttext classifier\n"
" --multilang Detect multiple languages in documents (up to 3),\n"
Expand Down

0 comments on commit 6cb12b9

Please sign in to comment.