-
Notifications
You must be signed in to change notification settings - Fork 3
/
url-encode.lisp
128 lines (120 loc) · 5.98 KB
/
url-encode.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
(defpackage :url-encode
(:use :cl)
(:export #:url-decode #:url-encode #:url-error))
(in-package :url-encode)
(define-condition url-error (simple-error) ())
(defun url-error (format &rest args)
(error 'url-error :format-control format :format-arguments args))
(defun char-utf-8-byte-length (char)
(let ((code (char-code char)))
(cond ((< code 128) 1)
((< code 2048) 2)
((< code 65536) 3)
(t 4))))
(defmacro as-utf-8-bytes (char writer)
"Given a character, calls the writer function for every byte in the
encoded form of that character."
(let ((char-code (gensym)))
`(let ((,char-code (char-code ,char)))
(declare (type fixnum ,char-code))
(cond ((< ,char-code 128)
(,writer ,char-code))
((< ,char-code 2048)
(,writer (logior #b11000000 (ldb (byte 5 6) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code))))
((< ,char-code 65536)
(,writer (logior #b11100000 (ldb (byte 4 12) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 6) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code))))
(t
(,writer (logior #b11110000 (ldb (byte 3 18) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 12) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 6) ,char-code)))
(,writer (logior #b10000000 (ldb (byte 6 0) ,char-code))))))))
(defun url-encode (string &optional (to-escape "\"#$%&+,/:;<=>?@"))
(declare (optimize speed (safety 0)))
(let ((size (loop :for ch :across string :for code := (char-code ch) :sum
(cond ((> code 127) (* (char-utf-8-byte-length ch) 3))
((or (< code 33) (find ch to-escape)) 3)
(t 1)))))
(if (= size (length string))
string
(let ((out (make-string size)) (pos 0))
(macrolet ((wr (ch) `(progn (setf (schar out pos) ,ch) (incf pos))))
(flet ((code-out (ch)
(multiple-value-bind (hi lo) (floor ch 16)
(wr #\%) (wr (digit-char hi 16)) (wr (digit-char lo 16)))))
(loop :for ch :across string :for code := (char-code ch) :do
(cond ((> code 127) (as-utf-8-bytes ch code-out))
((or (< code 33) (find ch to-escape)) (code-out code))
(t (wr ch))))))
out))))
(defun utf-8-group-size (byte)
"Determine the amount of bytes that are part of the character
starting with a given byte."
(declare (type fixnum byte))
(cond ((zerop (logand byte #b10000000)) 1)
((= (logand byte #b11100000) #b11000000) 2)
((= (logand byte #b11110000) #b11100000) 3)
((= (logand byte #b11111000) #b11110000) 4)
(t (url-error "Invalid UTF-8 byte: 0x~X" byte))))
(defun get-utf-8-character (bytes group-size &aux (start 0))
"Given an array of bytes and the amount of bytes to use,
extract the character they denote."
(declare (type (simple-array (unsigned-byte 8) (*)) bytes)
(type fixnum group-size))
(macrolet ((next-byte ()
'(prog1 (elt bytes start)
(incf start)))
(six-bits (byte)
(let ((b (gensym)))
`(let ((,b ,byte))
(unless (= (logand ,b #b11000000) #b10000000)
(url-error "Invalid byte 0x~X inside a character." ,b))
(ldb (byte 6 0) ,b))))
(test-overlong (byte min-size)
(let ((b (gensym)))
`(let ((,b ,byte))
(unless (> ,b ,min-size)
(url-error "Overlong UTF-8 byte sequence found."))
,b))))
(ecase group-size
(1 (next-byte))
(2 (test-overlong (logior (ash (ldb (byte 5 0) (next-byte)) 6)
(six-bits (next-byte))) 128))
(3 (test-overlong (logior (ash (ldb (byte 4 0) (next-byte)) 12)
(ash (six-bits (next-byte)) 6)
(six-bits (next-byte))) 2048))
(4 (test-overlong (logior (ash (ldb (byte 3 0) (next-byte)) 18)
(ash (six-bits (next-byte)) 12)
(ash (six-bits (next-byte)) 6)
(six-bits (next-byte))) 65536)))))
(defun url-decode (string &optional (leave ""))
(declare (optimize speed (safety 0)))
(let ((buf (make-string (length string)))
(pos 0)
(utf-buf (make-array 4 :element-type '(unsigned-byte 8))))
(declare (fixnum pos) (simple-string buf))
(with-input-from-string (in string)
(loop :for ch := (read-char in nil nil) :while ch :do
(macrolet ((hex ()
'(let ((big (digit-char-p (read-char in nil #\x) 16))
(small (digit-char-p (read-char in nil #\x) 16)))
(unless (and big small) (url-error "Junk in URL."))
(+ small (ash big 4))))
(out (x) `(progn (setf (schar buf pos) ,x) (incf pos))))
(case ch
(#\+ (out #\space))
(#\% (let* ((code (hex))
(group (utf-8-group-size code)))
(cond ((and (eql group 1) (find (code-char code) leave))
(multiple-value-bind (hi lo) (floor code 16)
(out #\%) (out (digit-char hi 16)) (out (digit-char lo 16))))
(t (setf (aref utf-buf 0) code)
(loop :for i :from 1 :below group :do
(unless (eql (read-char in nil nil) #\%)
(url-error "Nonsense UTF-8 code in URL."))
(setf (aref utf-buf i) (hex)))
(out (code-char (get-utf-8-character utf-buf group)))))))
(t (out ch))))
:finally (return (if (eql pos (length buf)) buf (subseq buf 0 pos)))))))