Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
lostfictions committed Aug 14, 2023
0 parents commit a866bb1
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/target

/data
117 changes: 117 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "conceptnet-trim"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# Reads the tab-delimited input file (`csv::ReaderBuilder` in src/main.rs).
csv = "1.2.2"
# Provides the `Serialize` / `Deserialize` derives used on the record structs.
serde = { version = "1.0.183", features = ["derive"] }
# Parses each row's JSON column and streams the JSON output array.
serde_json = "1.0.104"
68 changes: 68 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use csv::ReaderBuilder;
use serde::ser::{SerializeSeq, Serializer};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::error::Error;
use std::fs::File;
use std::io::{BufWriter, Write};

/// One row of the tab-separated input file, as read by `main`.
///
/// The reader is built with `has_headers(false)`, so the csv crate fills
/// these fields by column position rather than by name: the declaration
/// order below must match the column order of the input file.
#[derive(Debug, Deserialize)]
struct InRecord {
// First column; deserialized to keep the positions aligned but never read.
_uri: String,
// Relation column; copied unchanged into the output record.
rel: String,
// Start-node URI; used for filtering and copied into the output record.
start: String,
// End-node URI; used for filtering and copied into the output record.
end: String,
// Raw JSON text; `main` parses it and extracts only the "weight" key.
data: String,
}

/// One element of the JSON array written to the output file.
///
/// Holds the subset of an `InRecord` that is kept after filtering.
#[derive(Debug, Serialize)]
struct OutRecord {
// Relation, copied verbatim from the input row.
rel: String,
// Start-node URI, copied verbatim from the input row.
start: String,
// End-node URI, copied verbatim from the input row.
end: String,
// Value of the "weight" key in the row's JSON column; `None` (serialized
// as `null`) when the key is missing or is not a number.
weight: Option<f64>,
}

/// Returns `true` when `uri` is the `/c/en` node itself or lies beneath it
/// (`/c/en/...`).
///
/// A bare `starts_with("/c/en")` would also accept any other language code
/// that merely begins with "en" (for example `/c/enm/...`), so the prefix must
/// be followed by a `/` or by the end of the string.
fn is_english_concept(uri: &str) -> bool {
    match uri.strip_prefix("/c/en") {
        Some(rest) => rest.is_empty() || rest.starts_with('/'),
        None => false,
    }
}

/// Streams `data/assertions.csv` and writes the rows whose start and end
/// nodes are both English concepts to `data/trimmed.json` as a pretty-printed
/// JSON array of `OutRecord`s.
///
/// Progress is printed every million input rows, and a final summary reports
/// how many rows were read and how many were written.
///
/// # Errors
///
/// Returns an error if the input cannot be opened, the output cannot be
/// created, a row fails to deserialize, the JSON column of a kept row is not
/// valid JSON, or writing/flushing the output fails.
fn main() -> Result<(), Box<dyn Error>> {
    // The input is headerless and tab-separated, and one column carries raw
    // JSON; quoting is disabled so that `"` characters are taken literally.
    let mut rdr = ReaderBuilder::new()
        .has_headers(false)
        .delimiter(b'\t')
        .quoting(false)
        .from_path("data/assertions.csv")?;

    let file = File::create("data/trimmed.json")?;
    let mut writer = BufWriter::new(file);

    let mut ser = serde_json::Serializer::pretty(&mut writer);

    let mut read_rows = 0u64;
    let mut write_rows = 0u64;

    // The number of kept rows is unknown up front, so the sequence is opened
    // without a length and elements are emitted one at a time.
    let mut seq = ser.serialize_seq(None)?;
    for result in rdr.deserialize() {
        let in_rec: InRecord = result?;
        read_rows += 1;
        if read_rows % 1_000_000 == 0 {
            println!("row {:?}", read_rows);
        }

        if is_english_concept(&in_rec.start) && is_english_concept(&in_rec.end) {
            // Only kept rows pay the cost of parsing the JSON column.
            let data: Value = serde_json::from_str(&in_rec.data)?;

            let out_rec = OutRecord {
                rel: in_rec.rel,
                start: in_rec.start,
                end: in_rec.end,
                // `None` when "weight" is absent or not numeric.
                weight: data["weight"].as_f64(),
            };

            seq.serialize_element(&out_rec)?;
            write_rows += 1;
        }
    }
    seq.end()?;
    // Flush explicitly: errors raised while dropping a `BufWriter` are lost.
    writer.flush()?;
    println!("read {:?} rows, wrote {:?} rows", read_rows, write_rows);

    Ok(())
}

0 comments on commit a866bb1

Please sign in to comment.