View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2019, University of Amsterdam
    7                              VU University Amsterdam
    8                              CWI, Amsterdam
    9    All rights reserved.
   10
   11    Redistribution and use in source and binary forms, with or without
   12    modification, are permitted provided that the following conditions
   13    are met:
   14
   15    1. Redistributions of source code must retain the above copyright
   16       notice, this list of conditions and the following disclaimer.
   17
   18    2. Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in
   20       the documentation and/or other materials provided with the
   21       distribution.
   22
   23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   27    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   31    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   33    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34    POSSIBILITY OF SUCH DAMAGE.
   35*/
   36
   37:- module(xpath,
   38          [ xpath/3,                    % +DOM, +Spec, -Value
   39            xpath_chk/3,                % +DOM, +Spec, -Value
   40
   41            op(400, fx, //),
   42            op(400, fx, /),
   43            op(200, fy, @)
   44          ]).   45:- use_module(library(record)).   46:- use_module(library(lists)).   47:- use_module(library(debug)).   48:- use_module(library(error)).   49:- use_module(library(sgml)).   50
   51/** <module> Select nodes in an XML DOM
   52
   53The library xpath.pl provides predicates to select nodes from an XML DOM
   54tree as produced by library(sgml) based  on descriptions inspired by the
   55XPath language.
   56
   57The   predicate   xpath/3   selects   a   sub-structure   of   the   DOM
   58non-deterministically based on an  XPath-like   specification.  Not  all
   59selectors of XPath are implemented, but the ability to mix xpath/3 calls
   60with arbitrary Prolog code  provides  a   powerful  tool  for extracting
   61information from XML parse-trees.
   62
   63@see http://www.w3.org/TR/xpath
   64*/
   65
   66:- record
   67    element(name, attributes, content).   68
   69%!  xpath_chk(+DOM, +Spec, ?Content) is semidet.
   70%
   71%   Semi-deterministic version of xpath/3.
   72
   73xpath_chk(DOM, Spec, Content) :-
   74    xpath(DOM, Spec, Content),
   75    !.
   76
   77%!  xpath(+DOM, +Spec, ?Content) is nondet.
   78%
   79%   Match an element in a DOM structure.   The syntax is inspired by
   80%   XPath, using () rather than  []   to  select  inside an element.
   81%   First we can construct paths using / and //:
   82%
   83%       $ =|//|=Term :
   84%       Select any node in the DOM matching term.
   85%       $ =|/|=Term :
   86%       Match the root against Term.
   87%       $ Term :
   88%       Select the immediate children of the root matching Term.
   89%
   90%   The Terms above are of type   _callable_.  The functor specifies
   91%   the element name. The element name   '*'  refers to any element.
   92%   The name =self= refers to the   top-element  itself and is often
   93%   used for processing matches of an  earlier xpath/3 query. A term
   94%   NS:Term refers to an XML  name   in  the  namespace NS. Optional
   95%   arguments specify additional  constraints   and  functions.  The
   96%   arguments are processed from left  to right. Defined conditional
   97%   argument values are:
   98%
   99%       $ index(?Index) :
  100%       True if the element is the Index-th child of its parent,
  101%       where 1 denotes the first child. Index can be one of:
  102%         $ `Var` :
  103%         `Var` is unified with the index of the matched element.
  104%         $ =last= :
  105%         True for the last element.
  106%         $ =last= - `IntExpr` :
  107%         True for the last-minus-nth element. For example,
  108%         `last-1` is the element directly preceding the last one.
  109%         $ `IntExpr` :
  110%         True for the element whose index equals `IntExpr`.
  111%       $ Integer :
  112%       The N-th element with the given name, with 1 denoting the
  113%       first element. Same as index(Integer).
  114%       $ =last= :
  115%       The last element with the given name. Same as
  116%       index(last).
  117%       $ =last= - IntExpr :
  118%       The IntExpr-th element before the last.
  119%       Same as index(last-IntExpr).
  120%
  121%   Defined function argument values are:
  122%
  123%       $ =self= :
  124%       Evaluate to the entire element
  125%       $ =content= :
  126%       Evaluate to the content of the element (a list)
  127%       $ =text= :
  128%       Evaluates to all text from the sub-tree as an atom
  129%       $ `text(As)` :
  130%       Evaluates to all text from the sub-tree according to
  131%       `As`, which is either `atom` or `string`.
  132%       $ =normalize_space= :
  133%       As =text=, but uses normalize_space/2 to normalise
  134%       white-space in the output
  135%       $ =number= :
  136%       Extract an integer or float from the value.  Ignores
  137%       leading and trailing white-space
  138%       $ =|@|=Attribute :
  139%       Evaluates to the value of the given attribute.  Attribute
  140%       can be a compound term. In this case the functor name
  141%       denotes the element and arguments perform transformations
  142%       on the attribute value.  Defined transformations are:
  143%
  144%         - number
  145%         Translate the value into a number using
  146%         xsd_number_string/2 from library(sgml).
  147%         - integer
  148%         As `number`, but subsequently transform the value
  149%         into an integer using the round/1 function.
  150%         - float
  151%         As `number`, but subsequently transform the value
  152%         into a float using the float/1 function.
  153%         - string
  154%         Translate the value into a Prolog string.
  155%         - lower
  156%         Translate the value to lower case, preserving
  157%         the type.
  158%         - upper
  159%         Translate the value to upper case, preserving
  160%         the type.
  161%
  162%   In addition, the argument-list can be _conditions_:
  163%
  164%       $ Left = Right :
  165%       Succeeds if the left-hand unifies with the right-hand.
  166%       If the left-hand side is a function, this is evaluated.
  167%       The right-hand side is _never_ evaluated, and thus the
  168%       condition `content = content` defines that the content
  169%       of the element is the atom `content`.
  170%       The functions `lower_case` and `upper_case` can be applied
  171%       to Right (see example below).
  172%       $ contains(Haystack, Needle) :
  173%       Succeeds if Needle is a sub-string of Haystack.
  174%       $ XPath :
  175%       Succeeds if XPath matches in the currently selected
  176%       sub-DOM.  For example, the following expression finds
  177%       an =h3= element inside a =div= element, where the =div=
  178%       element itself contains an =h2= child with a =strong=
  179%       child.
  180%
  181%         ==
  182%         //div(h2/strong)/h3
  183%         ==
  184%
  185%       This is equivalent to the conjunction of XPath goals below.
  186%
  187%         ==
  188%            ...,
  189%            xpath(DOM, //(div), Div),
  190%            xpath(Div, h2/strong, _),
  191%            xpath(Div, h3, Result)
  192%         ==
  193%
  194%   **Examples**:
  195%
  196%   Match each table-row in DOM:
  197%
  198%       ==
  199%       xpath(DOM, //tr, TR)
  200%       ==
  201%
  202%   Match the last cell  of  each   tablerow  in  DOM.  This example
  203%   illustrates that a result can be the input of subsequent xpath/3
  204%   queries. Using multiple queries  on   the  intermediate  TR term
  205%   guarantee that all results come from the same table-row:
  206%
  207%       ==
  208%       xpath(DOM, //tr, TR),
  209%       xpath(TR,  /td(last), TD)
  210%       ==
  211%
  212%   Match each =href= attribute in an <a> element
  213%
  214%       ==
  215%       xpath(DOM, //a(@href), HREF)
  216%       ==
  217%
  218%   Suppose we have a table containing  rows where each first column
  219%   is the name of a product with a   link to details and the second
  220%   is the price (a number).  The   following  predicate matches the
  221%   name, URL and price:
  222%
  223%       ==
  224%       product(DOM, Name, URL, Price) :-
  225%           xpath(DOM, //tr, TR),
  226%           xpath(TR, td(1), C1),
  227%           xpath(C1, /self(normalize_space), Name),
  228%           xpath(C1, a(@href), URL),
  229%           xpath(TR, td(2, number), Price).
  230%       ==
  231%
  232%   Suppose we want to select  books   with  genre="thriller" from a
  233%   tree containing elements =|<book genre=...>|=
  234%
  235%       ==
  236%       thriller(DOM, Book) :-
  237%           xpath(DOM, //book(@genre=thiller), Book).
  238%       ==
  239%
  240%   Match the elements =|<table align="center">|= _and_ =|<table
  241%   align="CENTER">|=:
  242%
  243%       ```prolog
  244%           //table(@align(lower) = center)
  245%       ```
  246%
  247%   Get the `width` and `height` of a `div` element as a number,
  248%   and the `div` node itself:
  249%
  250%       ==
  251%           xpath(DOM, //div(@width(number)=W, @height(number)=H), Div)
  252%       ==
  253%
  254%   Note that `div` is an infix operator, so parentheses must be
  255%   used in cases like the following:
  256%
  257%       ==
  258%           xpath(DOM, //(div), Div)
  259%       ==
  260
  261xpath(DOM, Spec, Content) :-
  262    in_dom(Spec, DOM, Content).
  263
  264in_dom(//Spec, DOM, Value) :-
  265    !,
  266    element_spec(Spec, Name, Modifiers),
  267    sub_dom(I, Len, Name, E, DOM),
  268    modifiers(Modifiers, I, Len, E, Value).
  269in_dom(/Spec, E, Value) :-
  270    !,
  271    element_spec(Spec, Name, Modifiers),
  272    (   Name == self
  273    ->  true
  274    ;   element_name(E, Name)
  275    ),
  276    modifiers(Modifiers, 1, 1, E, Value).
  277in_dom(A/B, DOM, Value) :-
  278    !,
  279    in_dom(A, DOM, Value0),
  280    in_dom(B, Value0, Value).
  281in_dom(A//B, DOM, Value) :-
  282    !,
  283    in_dom(A, DOM, Value0),
  284    in_dom(//B, Value0, Value).
  285in_dom(Spec, element(_, _, Content), Value) :-
  286    element_spec(Spec, Name, Modifiers),
  287    count_named_elements(Content, Name, CLen),
  288    CLen > 0,
  289    nth_element(N, Name, E, Content),
  290    modifiers(Modifiers, N, CLen, E, Value).
  291
  292element_spec(Var, _, _) :-
  293    var(Var),
  294    !,
  295    instantiation_error(Var).
  296element_spec(NS:Term, NS:Name, Modifiers) :-
  297    !,
  298    callable_name_arguments(Term, Name0, Modifiers),
  299    star(Name0, Name).
  300element_spec(Term, Name, Modifiers) :-
  301    !,
  302    callable_name_arguments(Term, Name0, Modifiers),
  303    star(Name0, Name).
  304
  305callable_name_arguments(Atom, Name, Arguments) :-
  306    atom(Atom),
  307    !,
  308    Name = Atom, Arguments = [].
  309callable_name_arguments(Compound, Name, Arguments) :-
  310    compound_name_arguments(Compound, Name, Arguments).
  311
  312
  313star(*, _) :- !.
  314star(Name, Name).
  315
  316
  317%!  sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet.
  318%
  319%   Sub is a node in DOM with Name.
  320%
  321%   @param Count    is the total number of nodes in the content
  322%                   list Sub appears that have the same name.
  323%   @param Index    is the 1-based index of Sub of nodes with
  324%                   Name.
  325
  326sub_dom(1, 1, Name, DOM, DOM) :-
  327    element_name(DOM, Name0),
  328    \+ Name \= Name0.
  329sub_dom(N, Len, Name, E, element(_,_,Content)) :-
  330    !,
  331    sub_dom_2(N, Len, Name, E, Content).
  332sub_dom(N, Len, Name, E, Content) :-
  333    is_list(Content),
  334    sub_dom_2(N, Len, Name, E, Content).
  335
  336sub_dom_2(N, Len, Name, Element, Content) :-
  337    (   count_named_elements(Content, Name, Len),
  338        nth_element(N, Name, Element, Content)
  339    ;   member(element(_,_,C2), Content),
  340        sub_dom_2(N, Len, Name, Element, C2)
  341    ).
  342
  343
  344%!  count_named_elements(+Content, +Name, -Count) is det.
  345%
  346%   Count is the number of nodes with Name in Content.
  347
  348count_named_elements(Content, Name, Count) :-
  349    count_named_elements(Content, Name, 0, Count).
  350
  351count_named_elements([], _, Count, Count).
  352count_named_elements([element(Name,_,_)|T], Name0, C0, C) :-
  353    \+ Name \= Name0,
  354    !,
  355    C1 is C0+1,
  356    count_named_elements(T, Name0, C1, C).
  357count_named_elements([_|T], Name, C0, C) :-
  358    count_named_elements(T, Name, C0, C).
  359
  360
  361%!  nth_element(?N, +Name, -Element, +Content:list) is nondet.
  362%
  363%   True if Element is the N-th element with name in Content.
  364
  365nth_element(N, Name, Element, Content) :-
  366    nth_element_(1, N, Name, Element, Content).
  367
  368nth_element_(I, N, Name, E, [H|T]) :-
  369    element_name(H, Name0),
  370    \+ Name \= Name0,
  371    !,
  372    (   N = I,
  373        E = H
  374    ;   I2 is I + 1,
  375        (   nonvar(N), I2 > N
  376        ->  !, fail
  377        ;   true
  378        ),
  379        nth_element_(I2, N, Name, E, T)
  380    ).
  381nth_element_(I, N, Name, E, [_|T]) :-
  382    nth_element_(I, N, Name, E, T).
  383
  384
  385%!  modifiers(+Modifiers, +I, +Clen, +DOM, -Value)
  386%
  387%
  388
  389modifiers([], _, _, Value, Value).
  390modifiers([H|T], I, L, Value0, Value) :-
  391    modifier(H, I, L, Value0, Value1),
  392    modifiers(T, I, L, Value1, Value).
  393
  394modifier(M, _, _, _, _) :-
  395    var(M),
  396    !,
  397    instantiation_error(M).
  398modifier(Index, I, L, Value0, Value) :-
  399    implicit_index_modifier(Index),
  400    !,
  401    Value = Value0,
  402    index_modifier(Index, I, L).
  403modifier(index(Index), I, L, Value, Value) :-
  404    !,
  405    index_modifier(Index, I, L).
  406modifier(Function, _, _, In, Out) :-
  407    xpath_function(Function),
  408    !,
  409    xpath_function(Function, In, Out).
  410modifier(Function, _, _, In, Out) :-
  411    xpath_condition(Function, In),
  412    Out = In.
  413
  414implicit_index_modifier(I) :-
  415    integer(I),
  416    !.
  417implicit_index_modifier(last).
  418implicit_index_modifier(last-_Expr).
  419
  420index_modifier(Var, I, _L) :-
  421    var(Var),
  422    !,
  423    Var = I.
  424index_modifier(last, I, L) :-
  425    !,
  426    I =:= L.
  427index_modifier(last-Expr, I, L) :-
  428    !,
  429    I =:= L-Expr.
  430index_modifier(N, I, _) :-
  431    N =:= I.
  432
  433xpath_function(self, DOM, Value) :-                            % self
  434    !,
  435    Value = DOM.
  436xpath_function(content, Element, Value) :-                     % content
  437    !,
  438    element_content(Element, Value).
  439xpath_function(text, DOM, Text) :-                             % text
  440    !,
  441    text_of_dom(DOM, atom, Text).
  442xpath_function(text(As), DOM, Text) :-                         % text(As)
  443    !,
  444    text_of_dom(DOM, As, Text).
  445xpath_function(normalize_space, DOM, Text) :-                  % normalize_space
  446    !,
  447    text_of_dom(DOM, string, Text0),
  448    normalize_space(atom(Text), Text0).
  449xpath_function(number, DOM, Number) :-                         % number
  450    !,
  451    text_of_dom(DOM, string, Text0),
  452    normalize_space(string(Text), Text0),
  453    catch(xsd_number_string(Number, Text), _, fail).
  454xpath_function(@Name, element(_, Attrs, _), Value) :-          % @Name
  455    !,
  456    (   atom(Name)
  457    ->  memberchk(Name=Value, Attrs)
  458    ;   compound(Name)
  459    ->  compound_name_arguments(Name, AName, AOps),
  460        memberchk(AName=Value0, Attrs),
  461        translate_attribute(AOps, Value0, Value)
  462    ;   member(Name=Value, Attrs)
  463    ).
  464xpath_function(quote(Value), _, Value).                         % quote(Value)
  465
  466xpath_function(self).
  467xpath_function(content).
  468xpath_function(text).
  469xpath_function(text(_)).
  470xpath_function(normalize_space).
  471xpath_function(number).
  472xpath_function(@_).
  473xpath_function(quote(_)).
  474
  475translate_attribute([], Value, Value).
  476translate_attribute([H|T], Value0, Value) :-
  477    translate_attr(H, Value0, Value1),
  478    translate_attribute(T, Value1, Value).
  479
  480translate_attr(number, Value0, Value) :-
  481    xsd_number_string(Value, Value0).
  482translate_attr(integer, Value0, Value) :-
  483    xsd_number_string(Value1, Value0),
  484    Value is round(Value1).
  485translate_attr(float, Value0, Value) :-
  486    xsd_number_string(Value1, Value0),
  487    Value is float(Value1).
  488translate_attr(string, Value0, Value) :-
  489    atom_string(Value0, Value).
  490translate_attr(lower, Value0, Value) :-
  491    (   atom(Value0)
  492    ->  downcase_atom(Value0, Value)
  493    ;   string_lower(Value0, Value)
  494    ).
  495translate_attr(upper, Value0, Value) :-
  496    (   atom(Value0)
  497    ->  upcase_atom(Value0, Value)
  498    ;   string_upper(Value0, Value)
  499    ).
  500
  501xpath_condition(Left = Right, Value) :-                        % =
  502    !,
  503    var_or_function(Left, Value, LeftValue),
  504    process_equality(LeftValue, Right).
  505xpath_condition(contains(Haystack, Needle), Value) :-          % contains(Haystack, Needle)
  506    !,
  507    val_or_function(Haystack, Value, HaystackValue),
  508    val_or_function(Needle, Value, NeedleValue),
  509    atom(HaystackValue), atom(NeedleValue),
  510    (   sub_atom(HaystackValue, _, _, _, NeedleValue)
  511    ->  true
  512    ).
  513xpath_condition(Spec, Dom) :-
  514    in_dom(Spec, Dom, _).
  515
  516
  517%!  process_equality(+Left, +Right) is semidet.
  518%
  519%   Provides (very) partial support for XSLT   functions that can be
  520%   applied according to the XPath 2 specification.
  521%
  522%   For example the XPath expression  in   [1],  and  the equivalent
  523%   Prolog expression in [2], would both   match the HTML element in
  524%   [3].
  525%
  526%     ==
  527%     [1] //table[align=lower-case(center)]
  528%     [2] //table(@align=lower_case(center))
  529%     [3] <table align="CENTER">
  530%     ==
  531
  532process_equality(Left, Right) :-
  533    var(Right),
  534    !,
  535    Left = Right.
  536process_equality(Left, lower_case(Right)) :-
  537    !,
  538    downcase_atom(Left, Right).
  539process_equality(Left, upper_case(Right)) :-
  540    !,
  541    upcase_atom(Left, Right).
  542process_equality(Left, Right) :-
  543    Left = Right.
  544
  545
  546var_or_function(Arg, _, Arg) :-
  547    var(Arg),
  548    !.
  549var_or_function(Func, Value0, Value) :-
  550    xpath_function(Func),
  551    !,
  552    xpath_function(Func, Value0, Value).
  553var_or_function(Value, _, Value).
  554
  555val_or_function(Arg, _, Arg) :-
  556    var(Arg),
  557    !,
  558    instantiation_error(Arg).
  559val_or_function(Func, Value0, Value) :-                         % TBD
  560    xpath_function(Func, Value0, Value),
  561    !.
  562val_or_function(Value, _, Value).
  563
  564
  565%!  text_of_dom(+DOM, +As, -Text:atom) is det.
  566%
  567%   Text is the joined textual content of DOM.
  568
  569text_of_dom(DOM, As, Text) :-
  570    phrase(text_of(DOM), Tokens),
  571    (   As == atom
  572    ->  atomic_list_concat(Tokens, Text)
  573    ;   As == string
  574    ->  atomics_to_string(Tokens, Text)
  575    ;   must_be(oneof([atom,string]), As)
  576    ).
  577
  578text_of(element(_,_,Content)) -->
  579    text_of_list(Content).
  580text_of([]) -->
  581    [].
  582text_of([H|T]) -->
  583    text_of(H),
  584    text_of(T).
  585
  586
  587text_of_list([]) -->
  588    [].
  589text_of_list([H|T]) -->
  590    text_of_1(H),
  591    text_of_list(T).
  592
  593
  594text_of_1(element(_,_,Content)) -->
  595    !,
  596    text_of_list(Content).
  597text_of_1(Data) -->
  598    { assertion(atom_or_string(Data)) },
  599    [Data].
  600
  601atom_or_string(Data) :-
  602    (   atom(Data)
  603    ->  true
  604    ;   string(Data)
  605    )