Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Cyrillic and Greek support #100

Merged
merged 2 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 53 additions & 14 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)

// A XPath expression token type.
Expand Down Expand Up @@ -228,8 +229,9 @@ Loop:
}

// RelationalExpr ::= AdditiveExpr | RelationalExpr '<' AdditiveExpr | RelationalExpr '>' AdditiveExpr
// | RelationalExpr '<=' AdditiveExpr
// | RelationalExpr '>=' AdditiveExpr
//
// | RelationalExpr '<=' AdditiveExpr
// | RelationalExpr '>=' AdditiveExpr
func (p *parser) parseRelationalExpr(n node) node {
opnd := p.parseAdditiveExpr(n)
Loop:
Expand Down Expand Up @@ -274,7 +276,8 @@ Loop:
}

// MultiplicativeExpr ::= UnaryExpr | MultiplicativeExpr MultiplyOperator(*) UnaryExpr
// | MultiplicativeExpr 'div' UnaryExpr | MultiplicativeExpr 'mod' UnaryExpr
//
// | MultiplicativeExpr 'div' UnaryExpr | MultiplicativeExpr 'mod' UnaryExpr
func (p *parser) parseMultiplicativeExpr(n node) node {
opnd := p.parseUnaryExpr(n)
Loop:
Expand Down Expand Up @@ -308,7 +311,7 @@ func (p *parser) parseUnaryExpr(n node) node {
return opnd
}

// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
// UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
func (p *parser) parseUnionExpr(n node) node {
opnd := p.parsePathExpr(n)
Loop:
Expand Down Expand Up @@ -352,7 +355,7 @@ func (p *parser) parseFilterExpr(n node) node {
return opnd
}

// Predicate ::= '[' PredicateExpr ']'
// Predicate ::= '[' PredicateExpr ']'
func (p *parser) parsePredicate(n node) node {
p.skipItem(itemLBracket)
opnd := p.parseExpression(n)
Expand Down Expand Up @@ -447,7 +450,7 @@ func (p *parser) parseSequence(n node) (opnd node) {
return opnd
}

// NodeTest ::= NameTest | nodeType '(' ')' | 'processing-instruction' '(' Literal ')'
// NodeTest ::= NameTest | nodeType '(' ')' | 'processing-instruction' '(' Literal ')'
func (p *parser) parseNodeTest(n node, axeTyp string) (opnd node) {
switch p.r.typ {
case itemName:
Expand Down Expand Up @@ -672,6 +675,7 @@ type scanner struct {

pos int
curr rune
currSize int
typ itemType
strval string // text value at current pos
numval float64 // number value at current pos
Expand All @@ -681,10 +685,18 @@ type scanner struct {
func (s *scanner) nextChar() bool {
if s.pos >= len(s.text) {
s.curr = rune(0)
s.currSize = 1
return false
}
s.curr = rune(s.text[s.pos])
s.pos++

r, size := rune(s.text[s.pos]), 1
if r >= 0x80 { // handle multi-byte runes
r, size = utf8.DecodeRuneInString(s.text[s.pos:])
}

s.curr = r
s.currSize = size
s.pos += size
return true
}

Expand Down Expand Up @@ -843,34 +855,45 @@ func (s *scanner) scanString() string {
end = s.curr
)
s.nextChar()
i := s.pos - 1
i := s.pos - s.currSize
if s.currSize > 1 {
c++
}
for s.curr != end {
if !s.nextChar() {
panic(errors.New("xpath: scanString got unclosed string"))
}
c++
c += s.currSize
}
s.nextChar()
return s.text[i : i+c]
}

func (s *scanner) scanName() string {
var (
c int
i = s.pos - 1
c = s.currSize - 1
i = s.pos - s.currSize
)

// Detect current rune size

for isName(s.curr) {
c++
if !s.nextChar() {
c += s.currSize
break
}
c += s.currSize
}
return s.text[i : i+c]
}

func isName(r rune) bool {
return string(r) != ":" && string(r) != "/" &&
(unicode.Is(first, r) || unicode.Is(second, r) || string(r) == "*")
(unicode.Is(first, r) ||
unicode.Is(second, r) ||
unicode.Is(cyrillic, r) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to add this new condition for Cyrillic and Greek? We already recognize non-English letters.

unicode.Is(greek, r) ||
string(r) == "*")
}

func isDigit(r rune) bool {
Expand Down Expand Up @@ -1218,3 +1241,19 @@ var second = &unicode.RangeTable{
{0x30FC, 0x30FE, 1},
},
}

// cyrillic is the range table consulted by isName for Cyrillic runes.
// Each entry is {Lo, Hi, Stride}; a stride of 1 makes every code point in
// the inclusive span a member.
var cyrillic = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x0400, 0x04FF, 1}, // Cyrillic
		{0x0500, 0x052F, 1}, // Cyrillic Supplement
		{0x2DE0, 0x2DFF, 1}, // Cyrillic Extended-A
		{0xA640, 0xA69F, 1}, // Cyrillic Extended-B
	},
}

// greek is the range table consulted by isName for Greek runes.
// Each entry is {Lo, Hi, Stride}; a stride of 1 makes every code point in
// the inclusive span a member.
var greek = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x0370, 0x03FF, 1}, // Greek and Coptic
		{0x1F00, 0x1FFF, 1}, // Greek Extended
	},
}
2 changes: 1 addition & 1 deletion xpath_axes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ func Test_self(t *testing.T) {
}

func Test_child(t *testing.T) {
test_xpath_elements(t, employee_example, `//child::employee/child::email`, 6, 11, 16)
test_xpath_elements(t, employee_example, `/empinfo/child::*`, 3, 8, 13)
test_xpath_elements(t, employee_example, `/empinfo/child::node()`, 3, 8, 13)
test_xpath_values(t, employee_example, `//name/child::text()`, "Opal Kole", "Max Miller", "Beccaa Moss")
test_xpath_elements(t, employee_example, `//child::employee/child::email`, 6, 11, 16)
}

func Test_descendant(t *testing.T) {
Expand Down
49 changes: 49 additions & 0 deletions xpath_expression_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,52 @@ func TestSequence(t *testing.T) {
test_xpath_count(t, html_example, `//body/(h1, h2, p)`, 2)
test_xpath_count(t, html_example, `//body/(h1, h2, p, ..)`, 3)
}

// TestLatinAttributesInXPath builds a root node with a single <div> child
// carrying an ASCII attribute name and value, then runs an attribute
// predicate that names the same pair.
func TestLatinAttributesInXPath(t *testing.T) {
	const query = `//div[@language='english']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("language", "english")
	elem.lines = 1 // same value as the expected entry passed below

	test_xpath_elements(t, root, query, 1)
}

// TestCyrillicAttributesInXPath builds a root node with a single <div> child
// carrying a Cyrillic attribute name and value, then runs an attribute
// predicate that names the same pair.
func TestCyrillicAttributesInXPath(t *testing.T) {
	const query = `//div[@язык='русский']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	elem.lines = 1 // same value as the expected entry passed below

	test_xpath_elements(t, root, query, 1)
}

// TestGreekAttributesInXPath builds a root node with a single <div> child
// carrying a Greek attribute name and value, then runs an attribute
// predicate that names the same pair.
func TestGreekAttributesInXPath(t *testing.T) {
	const query = `//div[@γλώσσα='ελληνικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("γλώσσα", "ελληνικά")
	elem.lines = 1 // same value as the expected entry passed below

	test_xpath_elements(t, root, query, 1)
}

// TestCyrillicAndGreekAttributesMixedInXPath puts one Cyrillic and one Greek
// attribute on the same <div> and queries with a predicate that joins both
// attribute comparisons with "and".
func TestCyrillicAndGreekAttributesMixedInXPath(t *testing.T) {
	const query = `//div[@язык='русский' and @γλώσσα='ελληνικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	elem.addAttribute("γλώσσα", "ελληνικά")
	elem.lines = 1 // same value as the expected entry passed below

	test_xpath_elements(t, root, query, 1)
}

// TestCyrillicAttributesInXPath_NoMatch sets a Cyrillic attribute on a <div>
// and queries the same attribute name with a different value. No expected
// entries are passed to test_xpath_elements.
func TestCyrillicAttributesInXPath_NoMatch(t *testing.T) {
	const query = `//div[@язык='английский']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("язык", "русский")
	elem.lines = 1

	test_xpath_elements(t, root, query)
}

// TestGreekAttributesInXPath_NoMatch sets a Greek attribute on a <div> and
// queries the same attribute name with a different value. No expected
// entries are passed to test_xpath_elements.
func TestGreekAttributesInXPath_NoMatch(t *testing.T) {
	const query = `//div[@γλώσσα='αγγλικά']`

	root := createNode("", RootNode)
	elem := root.createChildNode("div", ElementNode)
	elem.addAttribute("γλώσσα", "ελληνικά")
	elem.lines = 1

	test_xpath_elements(t, root, query)
}