Output of the clustering input network added

eXascaleInfolab · Dec 13, 2017 · 8592e53 · 8592e53
1 parent 9d458b6
commit 8592e53
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -37,51 +37,66 @@ Accuracy of the StaTIX type inference (see details in [TInfES](https://github.co
 Usage: ./run.sh [OPTIONS...] <inputfile.rdf>
 Statistical type inference in fully automatic and semi supervised modes
 Options:
- -b,--brief-hints <arg>    Brief hits, possible values:
-                           '--'  - interactive hints
-                           '<filename.ipl>'  - read from the file having
-                           the format for each line:
-                           <indicativity> <property>
-                           where indicativity E [0, 1]; 0 - the property
-                           has no any impact on the entity type, 1 - the
-                           property fully specifies the entity type, line
-                           comments starting with '#' are allowed.
-                           '-[<nopts=INF>]'  - automatic generation of the
-                           hints to the <inpfile_marks.ipl>, where <marks>
-                           is the range of marks (>= 2) on supervision,
-                           which defines the indicativity precision
-                           eps=1/(marks*2): eps=0.167 for 3 marks
- -f,--filter               Filter out from the resulting clusters all
-                           subjects that do not have #type property in the
-                           input dataset, used for the type inference
-                           evaluation
- -g,--ground-truth <arg>   The ground-truth sample (subset of the input
-                           dataset or another similar dataset with the
-                           specified type properties)
- -h,--help                 Show usage
- -m,--multi-level          Output type inference for multiple scales
-                           (representative clusters from all hierarchy
-                           levels) besides the macro scale (top level,
-                           root)
- -n,--id-name <arg>        Output map of the id names (<inpfile>.idm in
-                           tab separated format: <id>
-                           <subject_name>), default: disabled
- -o,--output <arg>         Output file, default: <inpfile>.cnl
- -r,--reduce <arg>         Reduce similarity matrix on graph construction
-                           by non-significant relations to reduce memory
-                           consumption and speedup the clustering.
-                           Options: a - accurate, m - mean, s - severe.
-                           Recommended for large datasets
- -s,--scale <arg>          Scale (resolution, gamma parameter of the
-                           clustering), -1 is automatic scale inference
-                           for each cluster, >=0 is the forced static
-                           scale (<=1 for the macro clustering); default:
-                           -1
- -v,--version              Show version
+ -b,--brief-hints <arg>           Brief hints, possible values:
+                                  '--'  - interactive hints
+                                  '<filename.ipl>'  - read from the file
+                                  having the format for each line:
+                                  <indicativity> <property>
+                                  where indicativity E [0, 1]; 0 - the
+                                  property has no impact on the entity
+                                  type, 1 - the property fully specifies
+                                  the entity type, line comments starting
+                                  with '#' are allowed.
+                                  '-[<nopts=INF>]'  - automatic generation
+                                  of the hints to the <inpfile_marks.ipl>,
+                                  where <marks> is the range of marks (>=
+                                  2) on supervision, which defines the
+                                  indicativity precision eps=0.5/(marks +
+                                  1): eps=0.167 for 2 marks
+ -e,--extract-groundtruth <arg>   Extract ground-truth (ids of the
+                                  subjects per each type) to the specified
+                                  file in the .cnl format
+ -g,--groundtruth-sample <arg>    The ground-truth sample (subset of the
+                                  input dataset or another similar dataset
+                                  with the specified type properties)
+ -h,--help                        Show usage
+ -p,--network <arg>               Produce .rcg input network file for the
+                                  clustering without the type inference
+                                  itself
+ -m,--multi-level                 Output type inference for multiple
+                                  scales (representative clusters from all
+                                  hierarchy levels) besides the macro
+                                  scale (top level, root)
+ -n,--id-name <arg>               Output map of the id names
+                                  (<inpfile>.idm in tab separated format:
+                                  <id> <subject_name>), default: disabled
+ -o,--output <arg>                Output file, default: <inpfile>.cnl
+ -f,--filter                      Filter out from the resulting clusters
+                                  all subjects that do not have the
+                                  '#type' property in the input dataset,
+                                  used for the type inference evaluation
+ -r,--reduce <arg>                Reduce similarity matrix on graph
+                                  construction by non-significant
+                                  relations to reduce memory consumption
+                                  and speedup the clustering. Options: a -
+                                  accurate, m - mean, s - severe.
+                                  Recommended for large datasets
+ -s,--scale <arg>                 Scale (resolution, gamma parameter of
+                                  the clustering), -1 is automatic scale
+                                  inference for each cluster, >=0 is the
+                                  forced static scale (<=1 for the macro
+                                  clustering); default: -1
+ -u,--unique-triples              Unique triples only are present in the
+                                  ground-truth dataset (natty, clean data
+                                  without duplicates), so there is no need
+                                  of the possible duplicates
+                                  identification and omission
+ -v,--version                     Show version
 ```
 To infer types without the ground-truth available with the implicit output to the `inpDataset.cnl`: `./run.sh inpDataset.rdf`.  
 To infer types with available ground-truth for the sampled reduced dataset or using another typed dataset with similar structure, performing output to the `results.cnl`: `./run.sh -g gtSample.rdf -o results.cnl inpDataset.rdf`.  
-To infer types on multiple resolution levels (besides the whole dataset scope): `./run.sh -a inpDataset.rdf`.
+To infer types on multiple resolution levels (besides the whole dataset scope): `./run.sh -m inpDataset.rdf`.  
+To produce the input network in the [.rcg format](https://github.com/eXascaleInfolab/PyNetConvert#rcg) for the clustering without the type inference itself: `$ ./run.sh -f -p networks/gendr-f.rcg datasets/biomedical/gendr.rdf`.
 
 ### Compilation
 

diff --git a/src/info/exascale/statix/Statix.java b/src/info/exascale/statix/Statix.java
@@ -47,6 +47,7 @@ static double round(double val, int range) {
 
 	public static final String  extHints = ".ipl";  // Default extension of the hints file (indicativity of the property per line)
 	public static final String  extCls = ".cnl";  // Default extension of the clusters (inferred types) file (indicativity of the property per line)
+	public static final String  extNet = ".rcg";  // Default extension for the network (clustering input) file
 
 	private static final boolean  tracingOn = false;  // Enable tracing
 	private CosineSimilarityMatix  csmat = new CosineSimilarityMatix();
@@ -344,9 +345,9 @@ public void loadDatasets(String inpfname, String lblfname, boolean filteringOn,
 
 	protected Graph buildGraph() {
 		final Set<String>  instances = csmat.instances();
-		final int instsNum = instances.size();
-		Graph gr = new Graph(instsNum);
-		InpLinks grInpLinks  = new InpLinks();
+		final int  instsNum = instances.size();
+		Graph  gr = new Graph(instsNum);
+		InpLinks  grInpLinks = new InpLinks();
 
 		// Note: Java iterators are not copyable and there is not way to get iterator to the custom item,
 		// so even for the symmetric matrix all iterations should be done
@@ -378,6 +379,48 @@ protected Graph buildGraph() {
 		return gr;
 	}
 
+	public void saveNet(String outputPath) throws IOException {  // Save the pairwise instance similarities of csmat as a weighted network to outputPath; I/O errors propagate to the caller
+		try(
+			BufferedWriter  netf = Files.newBufferedWriter(Paths.get(outputPath));  // Closed automatically on exit from the try-with-resources block
+		) {
+			final Set<String>  instances = csmat.instances();
+			final int  instsNum = instances.size();
+
+			// Write the .rcg header: the declared number of nodes is the number of instances
+			netf.write("/Graph weighted:1 validated:1\n/Nodes " + instsNum + "\n/Edges\n");
+
+			// Note: Java iterators are not copyable and there is no way to get an iterator to the custom item,
+			// so even for the symmetric matrix all iterations should be done
+			int i = 0;  // Position of inst1 in the iteration order
+			for (String inst1: instances) {
+				final long  sid = csmat.instanceId(inst1);  // Source node id
+				boolean  initial = true;  // The line prefix for this source node has not been written yet
+				int  j = 0;  // Position counter for inst2, compared with i to select the pairs
+				for (String inst2: instances) {
+					if(j > i) {  // Skip back links and self links: only the instances following inst1 are processed
+						if(initial) {  // The prefix is written before the weight check, so a line may consist of the "<sid>>" prefix only
+							initial = false;
+							netf.write(sid + ">");
+						}
+						final float  weight = (float)csmat.similarity(inst1, inst2);  // Narrowed to float before the zero check and the output
+						if(weight == 0)
+							continue;  // Omit zero-weight links; ++j is skipped too, which is harmless: j > i already holds and j never decreases
+						final long did = csmat.instanceId(inst2);  // Destination node id
+						//System.out.print(" " + did + ":" + weight);
+						//if(weight <= 0 || Float.isNaN(weight))
+						//	throw new IllegalArgumentException("Weight for #(" + inst1 + ", " + inst2 + ") is out of range: " + weight);
+						netf.write(" " + did + ":" + weight);  // Link entry: " <did>:<weight>"
+					}
+					++j;
+				}
+				if(!initial)  // Terminate the line only if its prefix has been written (never for the last instance)
+					netf.write("\n");
+				++i;
+			}
+			System.err.println("The network is saved into: " + outputPath);  // Reported to stderr while the writer is still open
+		}
+	}
+
 	public void cluster(String outputPath, float scale, boolean multiLev, char reduction, boolean filteringOn) throws Exception {
 		System.err.println("Calling the clustering lib...");
 		Graph gr = buildGraph();

diff --git a/src/info/exascale/statix/main.java b/src/info/exascale/statix/main.java
@@ -3,6 +3,8 @@
 import java.text.ParseException;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.io.IOException;
+import java.io.File;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -35,14 +37,18 @@ public static void main(String[] args) throws Exception {
 		options.addOption("m", "multi-level", false, "Output type inference for multiple scales (representative clusters from all hierarchy levels) besides the macro scale (top level, root)");
 		options.addOption("s", "scale", true, "Scale (resolution, gamma parameter of the clustering), -1 is automatic scale inference for each cluster, >=0 is the forced static scale (<=1 for the macro clustering); default: -1");
 		options.addOption("r", "reduce", true, "Reduce similarity matrix on graph construction by non-significant relations to reduce memory consumption and speedup the clustering. Options: a - accurate, m - mean, s - severe. Recommended for large datasets");
-		options.addOption("f", "filter", false, "Filter out from the resulting clusters all subjects that do not have #type property in the input dataset, used for the type inference evaluation");
+		options.addOption("f", "filter", false, "Filter out from the resulting clusters all subjects that do not have the '#type' property in the input dataset, used for the type inference evaluation");
 		options.addOption("e", "extract-groundtruth", true, "Extract ground-truth (ids of the subjects per each type) to the specified file in the " + Statix.extCls + " format");
 		options.addOption("u", "unique-triples", false, "Unique triples only are present in the ground-truth dataset (natty, clean data without duplicates), so there is no need of the possible duplicates identification and omission");
+		options.addOption("p", "network", true, "Produce .rcg input network file for the clustering without the type inference itself");
 		options.addOption("v", "version", false, "Show version");
 
 		HelpFormatter formatter = new HelpFormatter();
 		String[] argsOpt = new String[]{"args"};
-		final String appusage = main.class.getCanonicalName() + " [OPTIONS...] <inputfile.rdf>";
+		final String appusage = //main.class.getCanonicalName()
+			//new File(main.class.getProtectionDomain().getCodeSource()
+			//.getLocation().getPath()).getName() +
+			"./run.sh [OPTIONS...] <inputfile.rdf>";
 		final String desription = "Statistical type inference in fully automatic and semi supervised modes\nOptions:";
 		final String reference = "\nSee details in https://github.com/eXascaleInfolab/StaTIX";
 		Statix  statix = new Statix();
@@ -154,8 +160,18 @@ public static void main(String[] args) throws Exception {
 				}
 			}
 
-			// Perform type inference
-			statix.cluster(outpfile, scale, cmd.hasOption("m"), reduction, filteringOn);
+			if(cmd.hasOption("p")) {
+				// Construct and output the input network for the subsequent clustering without the type inference itself
+				final String  netfile = cmd.getOptionValue("p");
+				try {
+					statix.saveNet(netfile);
+				} catch(IOException e) {
+					System.err.println("ERROR on saving to the network file (" + netfile + "):\n");
+					e.printStackTrace();
+					System.exit(1);
+				}
+			} else  // Perform type inference
+				statix.cluster(outpfile, scale, cmd.hasOption("m"), reduction, filteringOn);
 		}
 		catch (ParseException e) {  //  | IllegalArgumentException
 			e.printStackTrace();