path.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. // Copyright 2015-2019 Brett Vickers.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package etree
  5. import (
  6. "strconv"
  7. "strings"
  8. )
  9. /*
  10. A Path is a string that represents a search path through an etree starting
  11. from the document root or an arbitrary element. Paths are used with the
  12. Element object's Find* methods to locate and return desired elements.
  13. A Path consists of a series of slash-separated "selectors", each of which may
  14. be modified by one or more bracket-enclosed "filters". Selectors are used to
  15. traverse the etree from element to element, while filters are used to narrow
  16. the list of candidate elements at each node.
  17. Although etree Path strings are similar to XPath strings
  18. (https://www.w3.org/TR/1999/REC-xpath-19991116/), they have a more limited set
  19. of selectors and filtering options.
  20. The following selectors are supported by etree Path strings:
  21. . Select the current element.
  22. .. Select the parent of the current element.
  23. * Select all child elements of the current element.
  24. / Select the root element when used at the start of a path.
  25. // Select all descendants of the current element.
  26. tag Select all child elements with a name matching the tag.
  27. The following basic filters are supported by etree Path strings:
  28. [@attrib] Keep elements with an attribute named attrib.
  29. [@attrib='val'] Keep elements with an attribute named attrib and value matching val.
  30. [tag] Keep elements with a child element named tag.
  31. [tag='val'] Keep elements with a child element named tag and text matching val.
  32. [n] Keep the n-th element, where n is a numeric index starting from 1.
  33. The following function filters are also supported:
  34. [text()] Keep elements with non-empty text.
  35. [text()='val'] Keep elements whose text matches val.
  36. [local-name()='val'] Keep elements whose un-prefixed tag matches val.
  37. [name()='val'] Keep elements whose full tag exactly matches val.
  38. [namespace-prefix()='val'] Keep elements whose namespace prefix matches val.
  39. [namespace-uri()='val'] Keep elements whose namespace URI matches val.
  40. Here are some examples of Path strings:
  41. - Select the bookstore child element of the root element:
  42. /bookstore
  43. - Beginning from the root element, select the title elements of all
  44. descendant book elements having a 'category' attribute of 'WEB':
  45. //book[@category='WEB']/title
  46. - Beginning from the current element, select the first descendant
  47. book element with a title child element containing the text 'Great
  48. Expectations':
  49. .//book[title='Great Expectations'][1]
  50. - Beginning from the current element, select all child elements of
  51. book elements with an attribute 'language' set to 'english':
  52. ./book/*[@language='english']
  53. - Beginning from the current element, select all child elements of
  54. book elements containing the text 'special':
  55. ./book/*[text()='special']
  56. - Beginning from the current element, select all descendant book
  57. elements whose title child element has a 'language' attribute of 'french':
  58. .//book/title[@language='french']/..
  59. - Beginning from the current element, select all book elements
  60. belonging to the http://www.w3.org/TR/html4/ namespace:
  61. .//book[namespace-uri()='http://www.w3.org/TR/html4/']
  62. */
  63. type Path struct {
  64. segments []segment
  65. }
  66. // ErrPath is returned by path functions when an invalid etree path is provided.
  67. type ErrPath string
  68. // Error returns the string describing a path error.
  69. func (err ErrPath) Error() string {
  70. return "etree: " + string(err)
  71. }
  72. // CompilePath creates an optimized version of an XPath-like string that
  73. // can be used to query elements in an element tree.
  74. func CompilePath(path string) (Path, error) {
  75. var comp compiler
  76. segments := comp.parsePath(path)
  77. if comp.err != ErrPath("") {
  78. return Path{nil}, comp.err
  79. }
  80. return Path{segments}, nil
  81. }
  82. // MustCompilePath creates an optimized version of an XPath-like string that
  83. // can be used to query elements in an element tree. Panics if an error
  84. // occurs. Use this function to create Paths when you know the path is
  85. // valid (i.e., if it's hard-coded).
  86. func MustCompilePath(path string) Path {
  87. p, err := CompilePath(path)
  88. if err != nil {
  89. panic(err)
  90. }
  91. return p
  92. }
  93. // A segment is a portion of a path between "/" characters.
  94. // It contains one selector and zero or more [filters].
  95. type segment struct {
  96. sel selector
  97. filters []filter
  98. }
  99. func (seg *segment) apply(e *Element, p *pather) {
  100. seg.sel.apply(e, p)
  101. for _, f := range seg.filters {
  102. f.apply(p)
  103. }
  104. }
  105. // A selector selects XML elements for consideration by the
  106. // path traversal.
  107. type selector interface {
  108. apply(e *Element, p *pather)
  109. }
  110. // A filter pares down a list of candidate XML elements based
  111. // on a path filter in [brackets].
  112. type filter interface {
  113. apply(p *pather)
  114. }
  115. // A pather is helper object that traverses an element tree using
  116. // a Path object. It collects and deduplicates all elements matching
  117. // the path query.
  118. type pather struct {
  119. queue fifo
  120. results []*Element
  121. inResults map[*Element]bool
  122. candidates []*Element
  123. scratch []*Element // used by filters
  124. }
  125. // A node represents an element and the remaining path segments that
  126. // should be applied against it by the pather.
  127. type node struct {
  128. e *Element
  129. segments []segment
  130. }
  131. func newPather() *pather {
  132. return &pather{
  133. results: make([]*Element, 0),
  134. inResults: make(map[*Element]bool),
  135. candidates: make([]*Element, 0),
  136. scratch: make([]*Element, 0),
  137. }
  138. }
  139. // traverse follows the path from the element e, collecting
  140. // and then returning all elements that match the path's selectors
  141. // and filters.
  142. func (p *pather) traverse(e *Element, path Path) []*Element {
  143. for p.queue.add(node{e, path.segments}); p.queue.len() > 0; {
  144. p.eval(p.queue.remove().(node))
  145. }
  146. return p.results
  147. }
  148. // eval evalutes the current path node by applying the remaining
  149. // path's selector rules against the node's element.
  150. func (p *pather) eval(n node) {
  151. p.candidates = p.candidates[0:0]
  152. seg, remain := n.segments[0], n.segments[1:]
  153. seg.apply(n.e, p)
  154. if len(remain) == 0 {
  155. for _, c := range p.candidates {
  156. if in := p.inResults[c]; !in {
  157. p.inResults[c] = true
  158. p.results = append(p.results, c)
  159. }
  160. }
  161. } else {
  162. for _, c := range p.candidates {
  163. p.queue.add(node{c, remain})
  164. }
  165. }
  166. }
  167. // A compiler generates a compiled path from a path string.
  168. type compiler struct {
  169. err ErrPath
  170. }
  171. // parsePath parses an XPath-like string describing a path
  172. // through an element tree and returns a slice of segment
  173. // descriptors.
  174. func (c *compiler) parsePath(path string) []segment {
  175. // If path ends with //, fix it
  176. if strings.HasSuffix(path, "//") {
  177. path = path + "*"
  178. }
  179. var segments []segment
  180. // Check for an absolute path
  181. if strings.HasPrefix(path, "/") {
  182. segments = append(segments, segment{new(selectRoot), []filter{}})
  183. path = path[1:]
  184. }
  185. // Split path into segments
  186. for _, s := range splitPath(path) {
  187. segments = append(segments, c.parseSegment(s))
  188. if c.err != ErrPath("") {
  189. break
  190. }
  191. }
  192. return segments
  193. }
  194. func splitPath(path string) []string {
  195. pieces := make([]string, 0)
  196. start := 0
  197. inquote := false
  198. for i := 0; i+1 <= len(path); i++ {
  199. if path[i] == '\'' {
  200. inquote = !inquote
  201. } else if path[i] == '/' && !inquote {
  202. pieces = append(pieces, path[start:i])
  203. start = i + 1
  204. }
  205. }
  206. return append(pieces, path[start:])
  207. }
  208. // parseSegment parses a path segment between / characters.
  209. func (c *compiler) parseSegment(path string) segment {
  210. pieces := strings.Split(path, "[")
  211. seg := segment{
  212. sel: c.parseSelector(pieces[0]),
  213. filters: []filter{},
  214. }
  215. for i := 1; i < len(pieces); i++ {
  216. fpath := pieces[i]
  217. if fpath[len(fpath)-1] != ']' {
  218. c.err = ErrPath("path has invalid filter [brackets].")
  219. break
  220. }
  221. seg.filters = append(seg.filters, c.parseFilter(fpath[:len(fpath)-1]))
  222. }
  223. return seg
  224. }
  225. // parseSelector parses a selector at the start of a path segment.
  226. func (c *compiler) parseSelector(path string) selector {
  227. switch path {
  228. case ".":
  229. return new(selectSelf)
  230. case "..":
  231. return new(selectParent)
  232. case "*":
  233. return new(selectChildren)
  234. case "":
  235. return new(selectDescendants)
  236. default:
  237. return newSelectChildrenByTag(path)
  238. }
  239. }
  240. var fnTable = map[string]struct {
  241. hasFn func(e *Element) bool
  242. getValFn func(e *Element) string
  243. }{
  244. "local-name": {nil, (*Element).name},
  245. "name": {nil, (*Element).FullTag},
  246. "namespace-prefix": {nil, (*Element).namespacePrefix},
  247. "namespace-uri": {nil, (*Element).NamespaceURI},
  248. "text": {(*Element).hasText, (*Element).Text},
  249. }
  250. // parseFilter parses a path filter contained within [brackets].
  251. func (c *compiler) parseFilter(path string) filter {
  252. if len(path) == 0 {
  253. c.err = ErrPath("path contains an empty filter expression.")
  254. return nil
  255. }
  256. // Filter contains [@attr='val'], [fn()='val'], or [tag='val']?
  257. eqindex := strings.Index(path, "='")
  258. if eqindex >= 0 {
  259. rindex := nextIndex(path, "'", eqindex+2)
  260. if rindex != len(path)-1 {
  261. c.err = ErrPath("path has mismatched filter quotes.")
  262. return nil
  263. }
  264. key := path[:eqindex]
  265. value := path[eqindex+2 : rindex]
  266. switch {
  267. case key[0] == '@':
  268. return newFilterAttrVal(key[1:], value)
  269. case strings.HasSuffix(key, "()"):
  270. fn := key[:len(key)-2]
  271. if t, ok := fnTable[fn]; ok && t.getValFn != nil {
  272. return newFilterFuncVal(t.getValFn, value)
  273. }
  274. c.err = ErrPath("path has unknown function " + fn)
  275. return nil
  276. default:
  277. return newFilterChildText(key, value)
  278. }
  279. }
  280. // Filter contains [@attr], [N], [tag] or [fn()]
  281. switch {
  282. case path[0] == '@':
  283. return newFilterAttr(path[1:])
  284. case strings.HasSuffix(path, "()"):
  285. fn := path[:len(path)-2]
  286. if t, ok := fnTable[fn]; ok && t.hasFn != nil {
  287. return newFilterFunc(t.hasFn)
  288. }
  289. c.err = ErrPath("path has unknown function " + fn)
  290. return nil
  291. case isInteger(path):
  292. pos, _ := strconv.Atoi(path)
  293. switch {
  294. case pos > 0:
  295. return newFilterPos(pos - 1)
  296. default:
  297. return newFilterPos(pos)
  298. }
  299. default:
  300. return newFilterChild(path)
  301. }
  302. }
  303. // selectSelf selects the current element into the candidate list.
  304. type selectSelf struct{}
  305. func (s *selectSelf) apply(e *Element, p *pather) {
  306. p.candidates = append(p.candidates, e)
  307. }
  308. // selectRoot selects the element's root node.
  309. type selectRoot struct{}
  310. func (s *selectRoot) apply(e *Element, p *pather) {
  311. root := e
  312. for root.parent != nil {
  313. root = root.parent
  314. }
  315. p.candidates = append(p.candidates, root)
  316. }
  317. // selectParent selects the element's parent into the candidate list.
  318. type selectParent struct{}
  319. func (s *selectParent) apply(e *Element, p *pather) {
  320. if e.parent != nil {
  321. p.candidates = append(p.candidates, e.parent)
  322. }
  323. }
  324. // selectChildren selects the element's child elements into the
  325. // candidate list.
  326. type selectChildren struct{}
  327. func (s *selectChildren) apply(e *Element, p *pather) {
  328. for _, c := range e.Child {
  329. if c, ok := c.(*Element); ok {
  330. p.candidates = append(p.candidates, c)
  331. }
  332. }
  333. }
  334. // selectDescendants selects all descendant child elements
  335. // of the element into the candidate list.
  336. type selectDescendants struct{}
  337. func (s *selectDescendants) apply(e *Element, p *pather) {
  338. var queue fifo
  339. for queue.add(e); queue.len() > 0; {
  340. e := queue.remove().(*Element)
  341. p.candidates = append(p.candidates, e)
  342. for _, c := range e.Child {
  343. if c, ok := c.(*Element); ok {
  344. queue.add(c)
  345. }
  346. }
  347. }
  348. }
  349. // selectChildrenByTag selects into the candidate list all child
  350. // elements of the element having the specified tag.
  351. type selectChildrenByTag struct {
  352. space, tag string
  353. }
  354. func newSelectChildrenByTag(path string) *selectChildrenByTag {
  355. s, l := spaceDecompose(path)
  356. return &selectChildrenByTag{s, l}
  357. }
  358. func (s *selectChildrenByTag) apply(e *Element, p *pather) {
  359. for _, c := range e.Child {
  360. if c, ok := c.(*Element); ok && spaceMatch(s.space, c.Space) && s.tag == c.Tag {
  361. p.candidates = append(p.candidates, c)
  362. }
  363. }
  364. }
  365. // filterPos filters the candidate list, keeping only the
  366. // candidate at the specified index.
  367. type filterPos struct {
  368. index int
  369. }
  370. func newFilterPos(pos int) *filterPos {
  371. return &filterPos{pos}
  372. }
  373. func (f *filterPos) apply(p *pather) {
  374. if f.index >= 0 {
  375. if f.index < len(p.candidates) {
  376. p.scratch = append(p.scratch, p.candidates[f.index])
  377. }
  378. } else {
  379. if -f.index <= len(p.candidates) {
  380. p.scratch = append(p.scratch, p.candidates[len(p.candidates)+f.index])
  381. }
  382. }
  383. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  384. }
  385. // filterAttr filters the candidate list for elements having
  386. // the specified attribute.
  387. type filterAttr struct {
  388. space, key string
  389. }
  390. func newFilterAttr(str string) *filterAttr {
  391. s, l := spaceDecompose(str)
  392. return &filterAttr{s, l}
  393. }
  394. func (f *filterAttr) apply(p *pather) {
  395. for _, c := range p.candidates {
  396. for _, a := range c.Attr {
  397. if spaceMatch(f.space, a.Space) && f.key == a.Key {
  398. p.scratch = append(p.scratch, c)
  399. break
  400. }
  401. }
  402. }
  403. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  404. }
  405. // filterAttrVal filters the candidate list for elements having
  406. // the specified attribute with the specified value.
  407. type filterAttrVal struct {
  408. space, key, val string
  409. }
  410. func newFilterAttrVal(str, value string) *filterAttrVal {
  411. s, l := spaceDecompose(str)
  412. return &filterAttrVal{s, l, value}
  413. }
  414. func (f *filterAttrVal) apply(p *pather) {
  415. for _, c := range p.candidates {
  416. for _, a := range c.Attr {
  417. if spaceMatch(f.space, a.Space) && f.key == a.Key && f.val == a.Value {
  418. p.scratch = append(p.scratch, c)
  419. break
  420. }
  421. }
  422. }
  423. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  424. }
  425. // filterFunc filters the candidate list for elements satisfying a custom
  426. // boolean function.
  427. type filterFunc struct {
  428. fn func(e *Element) bool
  429. }
  430. func newFilterFunc(fn func(e *Element) bool) *filterFunc {
  431. return &filterFunc{fn}
  432. }
  433. func (f *filterFunc) apply(p *pather) {
  434. for _, c := range p.candidates {
  435. if f.fn(c) {
  436. p.scratch = append(p.scratch, c)
  437. }
  438. }
  439. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  440. }
  441. // filterFuncVal filters the candidate list for elements containing a value
  442. // matching the result of a custom function.
  443. type filterFuncVal struct {
  444. fn func(e *Element) string
  445. val string
  446. }
  447. func newFilterFuncVal(fn func(e *Element) string, value string) *filterFuncVal {
  448. return &filterFuncVal{fn, value}
  449. }
  450. func (f *filterFuncVal) apply(p *pather) {
  451. for _, c := range p.candidates {
  452. if f.fn(c) == f.val {
  453. p.scratch = append(p.scratch, c)
  454. }
  455. }
  456. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  457. }
  458. // filterChild filters the candidate list for elements having
  459. // a child element with the specified tag.
  460. type filterChild struct {
  461. space, tag string
  462. }
  463. func newFilterChild(str string) *filterChild {
  464. s, l := spaceDecompose(str)
  465. return &filterChild{s, l}
  466. }
  467. func (f *filterChild) apply(p *pather) {
  468. for _, c := range p.candidates {
  469. for _, cc := range c.Child {
  470. if cc, ok := cc.(*Element); ok &&
  471. spaceMatch(f.space, cc.Space) &&
  472. f.tag == cc.Tag {
  473. p.scratch = append(p.scratch, c)
  474. }
  475. }
  476. }
  477. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  478. }
  479. // filterChildText filters the candidate list for elements having
  480. // a child element with the specified tag and text.
  481. type filterChildText struct {
  482. space, tag, text string
  483. }
  484. func newFilterChildText(str, text string) *filterChildText {
  485. s, l := spaceDecompose(str)
  486. return &filterChildText{s, l, text}
  487. }
  488. func (f *filterChildText) apply(p *pather) {
  489. for _, c := range p.candidates {
  490. for _, cc := range c.Child {
  491. if cc, ok := cc.(*Element); ok &&
  492. spaceMatch(f.space, cc.Space) &&
  493. f.tag == cc.Tag &&
  494. f.text == cc.Text() {
  495. p.scratch = append(p.scratch, c)
  496. }
  497. }
  498. }
  499. p.candidates, p.scratch = p.scratch, p.candidates[0:0]
  500. }