Fixed quadratic behaviour

The script now enumerates all the input files, sorts them by basename (so that files with the same basename are grouped together), and then combines all the files that share a basename. Unfortunately, this doesn't seem to provide any speedup, probably because I'm running it through Cygwin, which does weird stuff with file I/O.
Arcton · Jul 30, 2016 · a5adb11 · a5adb11 · jtsymon · Jul 30, 2016
1 parent fb39ed2
commit a5adb11
Showing 1 changed file with 17 additions and 16 deletions.
diff --git a/combiner.sh b/combiner.sh
@@ -1,20 +1,21 @@
 #!/usr/bin/env sh
 
 mkdir -p combined-datasets
-for area in $(find processed-datasets/ -type f -printf '%f\n' | sort -u); do
-    (
-        echo '{'
-        first=1
-        for dataset in $(find processed-datasets/ -name "$area"); do
-            if [[ $first -eq 1 ]]; then
-                first=0
-            else
-                echo ","
-            fi
-            echo -n '"'$(basename $(dirname $dataset))'":'
-            cat $dataset
-        done
-        echo
-        echo '}'
-    ) > combined-datasets/$area
+unset area
+unset file
+# sort files by their basename
+find processed-datasets/ -type f | perl -e 'print sort{($p=$a)=~s!.*/!!;($q=$b)=~s!.*/!!;$p cmp$q}<>' | while read path; do
+    _area=$(basename $path)
+    if [[ "$_area" != "$area" ]]; then
+        area=$_area
+        if [[ "$file" != "" ]]; then
+            echo "}" >&3
+        fi
+        file=combined-datasets/$area
+        exec 3<> $file
+        echo "{" >&3
+    else
+        echo "," >&3
+    fi
+    cat $path >&3
 done