Skip to content

Commit

Permalink
feat: Cyrillic and greek support
Browse files Browse the repository at this point in the history
  • Loading branch information
Ludmil Simeonov committed Jul 19, 2024
1 parent 5481aef commit b38b781
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 15 deletions.
67 changes: 53 additions & 14 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)

// A XPath expression token type.
Expand Down Expand Up @@ -228,8 +229,9 @@ Loop:
}

// RelationalExpr ::= AdditiveExpr | RelationalExpr '<' AdditiveExpr | RelationalExpr '>' AdditiveExpr
// | RelationalExpr '<=' AdditiveExpr
// | RelationalExpr '>=' AdditiveExpr
//
// | RelationalExpr '<=' AdditiveExpr
// | RelationalExpr '>=' AdditiveExpr
func (p *parser) parseRelationalExpr(n node) node {
opnd := p.parseAdditiveExpr(n)
Loop:
Expand Down Expand Up @@ -274,7 +276,8 @@ Loop:
}

// MultiplicativeExpr ::= UnaryExpr | MultiplicativeExpr MultiplyOperator(*) UnaryExpr
// | MultiplicativeExpr 'div' UnaryExpr | MultiplicativeExpr 'mod' UnaryExpr
//
// | MultiplicativeExpr 'div' UnaryExpr | MultiplicativeExpr 'mod' UnaryExpr
func (p *parser) parseMultiplicativeExpr(n node) node {
opnd := p.parseUnaryExpr(n)
Loop:
Expand Down Expand Up @@ -308,7 +311,7 @@ func (p *parser) parseUnaryExpr(n node) node {
return opnd
}

// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
func (p *parser) parseUnionExpr(n node) node {
opnd := p.parsePathExpr(n)
Loop:
Expand Down Expand Up @@ -352,7 +355,7 @@ func (p *parser) parseFilterExpr(n node) node {
return opnd
}

// Predicate ::= '[' PredicateExpr ']'
// Predicate ::= '[' PredicateExpr ']'
func (p *parser) parsePredicate(n node) node {
p.skipItem(itemLBracket)
opnd := p.parseExpression(n)
Expand Down Expand Up @@ -447,7 +450,7 @@ func (p *parser) parseSequence(n node) (opnd node) {
return opnd
}

// NodeTest ::= NameTest | nodeType '(' ')' | 'processing-instruction' '(' Literal ')'
// NodeTest ::= NameTest | nodeType '(' ')' | 'processing-instruction' '(' Literal ')'
func (p *parser) parseNodeTest(n node, axeTyp string) (opnd node) {
switch p.r.typ {
case itemName:
Expand Down Expand Up @@ -672,6 +675,7 @@ type scanner struct {

pos int
curr rune
currSize int
typ itemType
strval string // text value at current pos
numval float64 // number value at current pos
Expand All @@ -681,10 +685,18 @@ type scanner struct {
// nextChar advances the scanner by one UTF-8 encoded rune.
//
// On success it stores the decoded rune in s.curr, its encoded width in
// bytes in s.currSize, moves s.pos past it and returns true. Invalid UTF-8
// is reported by utf8.DecodeRuneInString as utf8.RuneError with width 1, so
// the scanner always makes progress.
//
// At end of input it returns false with s.curr reset to 0. s.currSize is
// set to 1 in that case.
// NOTE(review): callers that derive offsets from s.pos-s.currSize may rely
// on that value at EOF; confirm before changing it.
func (s *scanner) nextChar() bool {
	if s.pos >= len(s.text) {
		s.curr = rune(0)
		s.currSize = 1
		return false
	}

	// Fast path: a byte below utf8.RuneSelf is a complete ASCII rune.
	// Anything else needs a full decode to find the rune and its width.
	r, size := rune(s.text[s.pos]), 1
	if r >= utf8.RuneSelf {
		r, size = utf8.DecodeRuneInString(s.text[s.pos:])
	}

	s.curr = r
	s.currSize = size
	s.pos += size
	return true
}

Expand Down Expand Up @@ -843,34 +855,45 @@ func (s *scanner) scanString() string {
end = s.curr
)
s.nextChar()
i := s.pos - 1
i := s.pos - s.currSize
if s.currSize > 1 {
c++
}
for s.curr != end {
if !s.nextChar() {
panic(errors.New("xpath: scanString got unclosed string"))
}
c++
c += s.currSize
}
s.nextChar()
return s.text[i : i+c]
}

// scanName returns the name token that starts at the current rune and
// leaves the scanner positioned on the first rune that is not part of the
// name (or at end of input).
//
// The token boundaries are tracked as byte offsets into s.text rather than
// by summing rune widths, so the result is correct regardless of how many
// bytes the runes inside the name, or the rune that terminates it, occupy.
// If the current rune is not a name character the result is "".
func (s *scanner) scanName() string {
	if !isName(s.curr) {
		return ""
	}

	// s.pos already points past the current rune, so the token starts
	// s.currSize bytes earlier and currently ends at s.pos.
	start := s.pos - s.currSize
	end := s.pos
	for s.nextChar() && isName(s.curr) {
		end = s.pos
	}
	return s.text[start:end]
}

// isName reports whether r may appear in a name token.
//
// ':' and '/' are never name characters, '*' always is; every other rune is
// accepted when it belongs to one of the first, second, cyrillic or greek
// range tables.
func isName(r rune) bool {
	// Compare runes directly instead of converting to string for each test.
	switch r {
	case ':', '/':
		return false
	case '*':
		return true
	}
	return unicode.In(r, first, second, cyrillic, greek)
}

func isDigit(r rune) bool {
Expand Down Expand Up @@ -1218,3 +1241,19 @@ var second = &unicode.RangeTable{
{0x30FC, 0x30FE, 1},
},
}

// cyrillic lists the code points of the Cyrillic Unicode blocks that are
// accepted in names. Cyrillic (U+0400-U+04FF) and Cyrillic Supplement
// (U+0500-U+052F) are contiguous and therefore expressed as one range.
var cyrillic = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x0400, 0x052F, 1}, // Cyrillic + Cyrillic Supplement
		{0x2DE0, 0x2DFF, 1}, // Cyrillic Extended-A
		{0xA640, 0xA69F, 1}, // Cyrillic Extended-B
	},
}

// greek lists the code points of the Greek Unicode blocks that are accepted
// in names.
//
// U+037E GREEK QUESTION MARK is deliberately left out: it is punctuation
// (canonically equivalent to ';') and the XML 1.0 name productions exclude
// it ([#x370-#x37D] | [#x37F-#x1FFF]).
var greek = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x0370, Hi: 0x037D, Stride: 1}, // Greek and Coptic, up to U+037D
		{Lo: 0x037F, Hi: 0x03FF, Stride: 1}, // Greek and Coptic, from U+037F
		{Lo: 0x1F00, Hi: 0x1FFF, Stride: 1}, // Greek Extended
	},
}
2 changes: 1 addition & 1 deletion xpath_axes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ func Test_self(t *testing.T) {
}

// Test_child exercises the child:: axis with a wildcard, a node() test, a
// text() test and a named element test against employee_example. Each
// expression is asserted exactly once.
func Test_child(t *testing.T) {
	test_xpath_elements(t, employee_example, `/empinfo/child::*`, 3, 8, 13)
	test_xpath_elements(t, employee_example, `/empinfo/child::node()`, 3, 8, 13)
	test_xpath_values(t, employee_example, `//name/child::text()`, "Opal Kole", "Max Miller", "Beccaa Moss")
	test_xpath_elements(t, employee_example, `//child::employee/child::email`, 6, 11, 16)
}

func Test_descendant(t *testing.T) {
Expand Down
49 changes: 49 additions & 0 deletions xpath_expression_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,52 @@ func TestSequence(t *testing.T) {
test_xpath_count(t, html_example, `//body/(h1, h2, p)`, 2)
test_xpath_count(t, html_example, `//body/(h1, h2, p, ..)`, 3)
}

// TestLatinAttributesInXPath is the ASCII baseline: a predicate on an
// attribute with a Latin name and value must select the <div> carrying it.
func TestLatinAttributesInXPath(t *testing.T) {
	const expr = `//div[@language='english']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("language", "english")
	// NOTE(review): the trailing 1 passed below appears to be matched
	// against this lines value; confirm against test_xpath_elements.
	elem.lines = 1
	test_xpath_elements(t, root, expr, 1)
}

// TestCyrillicAttributesInXPath checks that an attribute whose name and
// value are written in Cyrillic can be matched by a predicate.
func TestCyrillicAttributesInXPath(t *testing.T) {
	const expr = `//div[@язык='русский']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	// NOTE(review): the trailing 1 passed below appears to be matched
	// against this lines value; confirm against test_xpath_elements.
	elem.lines = 1
	test_xpath_elements(t, root, expr, 1)
}

// TestGreekAttributesInXPath checks that an attribute whose name and value
// are written in Greek can be matched by a predicate.
func TestGreekAttributesInXPath(t *testing.T) {
	const expr = `//div[@γλώσσα='ελληνικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("γλώσσα", "ελληνικά")
	// NOTE(review): the trailing 1 passed below appears to be matched
	// against this lines value; confirm against test_xpath_elements.
	elem.lines = 1
	test_xpath_elements(t, root, expr, 1)
}

// TestCyrillicAndGreekAttributesMixedInXPath checks that Cyrillic and Greek
// attribute tests can be combined with `and` inside a single predicate.
func TestCyrillicAndGreekAttributesMixedInXPath(t *testing.T) {
	const expr = `//div[@язык='русский' and @γλώσσα='ελληνικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	elem.addAttribute("γλώσσα", "ελληνικά")
	// NOTE(review): the trailing 1 passed below appears to be matched
	// against this lines value; confirm against test_xpath_elements.
	elem.lines = 1
	test_xpath_elements(t, root, expr, 1)
}

// TestCyrillicAttributesInXPath_NoMatch is the negative case: the Cyrillic
// attribute exists but its value differs from the one in the predicate, so
// no elements are expected.
func TestCyrillicAttributesInXPath_NoMatch(t *testing.T) {
	const expr = `//div[@язык='английский']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	elem.lines = 1
	test_xpath_elements(t, root, expr)
}

// TestGreekAttributesInXPath_NoMatch is the negative case: the Greek
// attribute exists but its value differs from the one in the predicate, so
// no elements are expected.
func TestGreekAttributesInXPath_NoMatch(t *testing.T) {
	const expr = `//div[@γλώσσα='αγγλικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("γλώσσα", "ελληνικά")
	elem.lines = 1
	test_xpath_elements(t, root, expr)
}

0 comments on commit b38b781

Please sign in to comment.