-
Notifications
You must be signed in to change notification settings - Fork 0
/
count
executable file
·29 lines (24 loc) · 1.14 KB
/
count
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env bash
# count - Count character occurrences in file
# Usage: count [-c|--codepoint] <file>
# -c/--codepoint: sort by code point instead of frequency
# Required: uni2ascii (provides uni2ascii and ascii2uni), sed, coreutils (sort, uniq)
## Option handling.
## f selects the output ordering: 1 = by frequency (default), 0 = by code point.
## The file header documents -c/--codepoint, while this code used to accept
## only -s; all three spellings are now accepted, either before or after the
## file name, so existing "-s" invocations keep working.
f=1
case ${1-} in
  -c | --codepoint | -s)
    f=0
    shift
    ;;
esac

## After the optional leading flag, $1 must name an existing regular file.
## Diagnostics go to stderr so they never mix with the counted output.
if [[ ! -f ${1-} ]]; then
  printf 'Usage: %s [-c|--codepoint] <file>\n (-c/--codepoint, or the older -s, means sort by code point instead of frequency)\n\nNo input file: %s\nAborting\n' "$0" "${1-}" >&2
  exit 2
fi

## The flag may also follow the file name.
case ${2-} in
  -c | --codepoint | -s) f=0 ;;
esac

## f1: sed substitution handed to `sed -e` in the pipeline below. It rewrites a
##     "<count> ... u<HEX>" line into three fields (count, U<HEX>, u<HEX>),
##     with the count first for frequency order or last for code point order.
## f2: text of the final pipeline stage, executed with eval. It is a fixed
##     string built here, never user input. It sorts the lines (numerically by
##     count, or lexically by the leading u<HEX>) and its sed steps label a row
##     as "LF u000A" -- presumably the line-feed character; confirm against
##     real output before changing these expressions.
if ((f)); then
  f1='s/\s*\([0-9][0-9]*\).*u\([0-9A-F]*\)/\1 U\2 u\2/'
  f2="sort -n |sed 's/^ [ ]*\( .*\)/\1LF u000A/' |sed 's/^/ /' |sed 's/ *\(.\{11,\}\) / \1 /'"
else
  f1='s/\s*\([0-9][0-9]*\).*u\([0-9A-F]*\)/u\2 U\2 \1/'
  f2="sed 's/^ \(.*\)/u000A LF \1/' |sort"
fi
## Fail early, with a clear message, when one of the external converters is
## missing. Without this check the pipeline only prints "command not found"
## and the script still finishes with exit status 0.
for tool in uni2ascii ascii2uni; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf '%s: required command not found: %s\n' "$0" "$tool" >&2
    exit 127
  fi
done

## Represent UTF-8 in \u-escaped hexadecimal ASCII (no newline)
uni2ascii -psaU "$1" |
## Remove \ and insert newlines and remove empty lines
## (every "\u" becomes a line break followed by "u", so each escape ends up
## on a line of its own)
sed -e 's/\\u/\nu/g' |sed '/^$/d' |
## Sort and count unique first 5 characters
## NOTE(review): -w5 compares only "u" plus four hex digits; escapes with more
## than four hex digits would be grouped by their first four -- TODO confirm
## what uni2ascii emits for such characters.
sort |uniq -c -w5 |
## Replace whitespace by 1 tab + fill in the decimal & hexadecimal values:
## NOTE(review): in sed's basic regex syntax "+" is a literal plus sign, so
## '\W+' is not "one or more non-word characters" and the first expression may
## never match here; the second expression ($f1) does the actual reformatting.
## Left unchanged to preserve the current output -- confirm before editing.
sed -e 's/\W+/\t/g' -e "$f1" |
## Convert 0xHHHH unicode points to UTF-8 as <Uhhhh>, and sort
## ($f2 is a fixed pipeline string assembled by this script, not user input.)
ascii2uni -aE |eval "$f2"
## NOTE(review): the pipeline's own exit status is not propagated; kept as-is
## to preserve existing behaviour. Consider `set -o pipefail` once the exit
## codes of uni2ascii/ascii2uni have been confirmed.
exit 0