View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2000-2018, University of Amsterdam
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(sgml,
   37          [ load_html/3,                % +Input, -DOM, +Options
   38            load_xml/3,                 % +Input, -DOM, +Options
   39            load_sgml/3,                % +Input, -DOM, +Options
   40
   41            load_sgml_file/2,           % +File, -ListOfContent
   42            load_xml_file/2,            % +File, -ListOfContent
   43            load_html_file/2,           % +File, -Document
   44
   45            load_structure/3,           % +File, -Term, +Options
   46
   47            load_dtd/2,                 % +DTD, +File
   48            load_dtd/3,                 % +DTD, +File, +Options
   49            dtd/2,                      % +Type, -DTD
   50            dtd_property/2,             % +DTD, ?Property
   51
   52            new_dtd/2,                  % +Doctype, -DTD
   53            free_dtd/1,                 % +DTD
   54            open_dtd/3,                 % +DTD, +Options, -Stream
   55
   56            new_sgml_parser/2,          % -Parser, +Options
   57            free_sgml_parser/1,         % +Parser
   58            set_sgml_parser/2,          % +Parser, +Options
   59            get_sgml_parser/2,          % +Parser, +Options
   60            sgml_parse/2,               % +Parser, +Options
   61
   62            sgml_register_catalog_file/2, % +File, +StartOrEnd
   63
   64            xml_quote_attribute/3,      % +In, -Quoted, +Encoding
   65            xml_quote_cdata/3,          % +In, -Quoted, +Encoding
   66            xml_quote_attribute/2,      % +In, -Quoted
   67            xml_quote_cdata/2,          % +In, -Quoted
   68            xml_name/1,                 % +In
   69            xml_name/2,                 % +In, +Encoding
   70
   71            xsd_number_string/2,        % ?Number, ?String
   72            xsd_time_string/3,          % ?Term, ?Type, ?String
   73
   74            xml_basechar/1,             % +Code
   75            xml_ideographic/1,          % +Code
   76            xml_combining_char/1,       % +Code
   77            xml_digit/1,                % +Code
   78            xml_extender/1,             % +Code
   79
   80            iri_xml_namespace/2,        % +IRI, -Namespace
   81            iri_xml_namespace/3,        % +IRI, -Namespace, -LocalName
   82            xml_is_dom/1                % +Term
   83          ]).   84:- use_module(library(lists)).   85:- use_module(library(option)).   86:- use_module(library(error)).   87:- use_module(library(iostream)).   88
   89:- meta_predicate
   90    load_structure(+, -, :),
   91    load_html(+, -, :),
   92    load_xml(+, -, :),
   93    load_sgml(+, -, :).   94
   95:- predicate_options(load_structure/3, 3,
   96                     [ charpos(integer),
   97                       cdata(oneof([atom,string])),
   98                       defaults(boolean),
   99                       dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
  100                       doctype(atom),
  101                       dtd(any),
  102                       encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
  103                       entity(atom,atom),
  104                       keep_prefix(boolean),
  105                       file(atom),
  106                       line(integer),
  107                       offset(integer),
  108                       number(oneof([token,integer])),
  109                       qualify_attributes(boolean),
  110                       shorttag(boolean),
  111                       case_sensitive_attributes(boolean),
  112                       case_preserving_attributes(boolean),
  113                       system_entities(boolean),
  114                       max_memory(integer),
  115                       space(oneof([sgml,preserve,default,remove])),
  116                       xmlns(atom),
  117                       xmlns(atom,atom),
  118                       pass_to(sgml_parse/2, 2)
  119                     ]).  120:- predicate_options(load_html/3, 3,
  121                     [ pass_to(load_structure/3, 3)
  122                     ]).  123:- predicate_options(load_xml/3, 3,
  124                     [ pass_to(load_structure/3, 3)
  125                     ]).  126:- predicate_options(load_sgml/3, 3,
  127                     [ pass_to(load_structure/3, 3)
  128                     ]).  129:- predicate_options(load_dtd/3, 3,
  130                     [ dialect(oneof([sgml,xml,xmlns])),
  131                       pass_to(open/4, 4)
  132                     ]).  133:- predicate_options(sgml_parse/2, 2,
  134                     [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
  135                            callable),
  136                       cdata(oneof([atom,string])),
  137                       content_length(integer),
  138                       document(-any),
  139                       max_errors(integer),
  140                       parse(oneof([file,element,content,declaration,input])),
  141                       source(any),
  142                       syntax_errors(oneof([quiet,print,style])),
  143                       xml_no_ns(oneof([error,quiet]))
  144                     ]).  145:- predicate_options(new_sgml_parser/2, 2,
  146                     [ dtd(any)
  147                     ]).  148
  149
  150/** <module> SGML, XML and HTML parser
  151
  152This library allows you to parse SGML, XML   and HTML data into a Prolog
  153data structure. The library defines several families of predicates:
  154
  155  $ High-level predicates :
  156  Most users will only use load_html/3, load_xml/3 or load_sgml/3 to
  157  parse arbitrary input into a _DOM_ structure.  These predicates all
  158  call load_structure/3, which provides more options and may be
  159  used for processing non-standard documents.
  160
  161  The DOM structure can be used by library(xpath) to extract information
  162  from the document.
  163
  164  $ The low-level parser :
  165  The actual parser is written in C and consists of two parts: one for
  166  processing DTD (Document Type Definitions) and one for parsing data.
  167  The data can either be parsed to a Prolog (_DOM_) term or the parser
  168  can perform callbacks for the DOM _events_.
  169
  170  $ Utility predicates :
  171  Finally, this library provides prmitives for classifying characters
  172  and strings according to the XML specification such as xml_name/1 to
  173  verify whether an atom is a valid XML name (identifier).  It also
  174  provides primitives to quote attributes and CDATA elements.
  175*/
  176
  177:- multifile user:file_search_path/2.  178:- dynamic   user:file_search_path/2.  179
  180user:file_search_path(dtd, '.').
  181user:file_search_path(dtd, swi('library/DTD')).
  182
  183sgml_register_catalog_file(File, Location) :-
  184    prolog_to_os_filename(File, OsFile),
  185    '_sgml_register_catalog_file'(OsFile, Location).
  186
  187:- use_foreign_library(foreign(sgml2pl)).  188
  189register_catalog(Base) :-
  190    absolute_file_name(dtd(Base),
  191                           [ extensions([soc]),
  192                             access(read),
  193                             file_errors(fail)
  194                           ],
  195                           SocFile),
  196    sgml_register_catalog_file(SocFile, end).
  197
  198:- initialization
  199    ignore(register_catalog('HTML4')).  200
  201
  202                 /*******************************
  203                 *         DTD HANDLING         *
  204                 *******************************/
  205
  206/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  207Note that concurrent access to DTD objects  is not allowed, and hence we
  208will allocate and destroy them in each   thread.  Possibibly it would be
  209nicer to find out why  concurrent  access   to  DTD's  is  flawed. It is
  210diagnosed to mess with the entity resolution by Fabien Todescato.
  211- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  212
  213:- thread_local
  214    current_dtd/2.  215:- volatile
  216    current_dtd/2.  217:- thread_local
  218    registered_cleanup/0.  219:- volatile
  220    registered_cleanup/0.  221
  222:- multifile
  223    dtd_alias/2.  224
  225:- create_prolog_flag(html_dialect, html5, [type(atom)]).  226
  227dtd_alias(html4, 'HTML4').
  228dtd_alias(html5, 'HTML5').
  229dtd_alias(html,  DTD) :-
  230    current_prolog_flag(html_dialect, Dialect),
  231    dtd_alias(Dialect, DTD).
  232
  233%!  dtd(+Type, -DTD) is det.
  234%
  235%   DTD is a DTD object created from  the file dtd(Type). Loaded DTD
  236%   objects are cached. Note that  DTD   objects  may  not be shared
  237%   between threads. Therefore, dtd/2  maintains   the  pool  of DTD
  238%   objects  using  a  thread_local  predicate.    DTD  objects  are
  239%   destroyed if a thread terminates.
  240%
  241%   @error existence_error(source_sink, dtd(Type))
  242
  243dtd(Type, DTD) :-
  244    current_dtd(Type, DTD),
  245    !.
  246dtd(Type, DTD) :-
  247    new_dtd(Type, DTD),
  248    (   dtd_alias(Type, Base)
  249    ->  true
  250    ;   Base = Type
  251    ),
  252    absolute_file_name(dtd(Base),
  253                       [ extensions([dtd]),
  254                         access(read)
  255                       ], DtdFile),
  256    load_dtd(DTD, DtdFile),
  257    register_cleanup,
  258    asserta(current_dtd(Type, DTD)).
  259
  260%!  load_dtd(+DTD, +DtdFile, +Options)
  261%
  262%   Load DtdFile into a DTD.  Defined options are:
  263%
  264%           * dialect(+Dialect)
  265%           Dialect to use (xml, xmlns, sgml)
  266%
  267%           * encoding(+Encoding)
  268%           Encoding of DTD file
  269%
  270%   @param  DTD is a fresh DTD object, normally created using
  271%           new_dtd/1.
  272
  273load_dtd(DTD, DtdFile) :-
  274    load_dtd(DTD, DtdFile, []).
  275load_dtd(DTD, DtdFile, Options) :-
  276    sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
  277    setup_call_cleanup(
  278        open_dtd(DTD, DTDOptions, DtdOut),
  279        setup_call_cleanup(
  280            open(DtdFile, read, DtdIn, OpenOptions),
  281            copy_stream_data(DtdIn, DtdOut),
  282            close(DtdIn)),
  283        close(DtdOut)).
  284
  285split_dtd_options([], [], []).
  286split_dtd_options([H|T], [H|TD], S) :-
  287    dtd_option(H),
  288    !,
  289    split_dtd_options(T, TD, S).
  290split_dtd_options([H|T], TD, [H|S]) :-
  291    split_dtd_options(T, TD, S).
  292
  293dtd_option(dialect(_)).
  294
  295
  296%!  destroy_dtds
  297%
  298%   Destroy  DTDs  cached  by  this  thread   as  they  will  become
  299%   unreachable anyway.
  300
  301destroy_dtds :-
  302    (   current_dtd(_Type, DTD),
  303        free_dtd(DTD),
  304        fail
  305    ;   true
  306    ).
  307
  308%!  register_cleanup
  309%
  310%   Register cleanup of DTDs created for this thread.
  311
  312register_cleanup :-
  313    registered_cleanup,
  314    !.
  315register_cleanup :-
  316    catch(thread_at_exit(destroy_dtds), _, true),
  317    assert(registered_cleanup).
  318
  319
  320                 /*******************************
  321                 *          EXAMINE DTD         *
  322                 *******************************/
  323
  324prop(doctype(_), _).
  325prop(elements(_), _).
  326prop(entities(_), _).
  327prop(notations(_), _).
  328prop(entity(E, _), DTD) :-
  329    (   nonvar(E)
  330    ->  true
  331    ;   '$dtd_property'(DTD, entities(EL)),
  332        member(E, EL)
  333    ).
  334prop(element(E, _, _), DTD) :-
  335    (   nonvar(E)
  336    ->  true
  337    ;   '$dtd_property'(DTD, elements(EL)),
  338        member(E, EL)
  339    ).
  340prop(attributes(E, _), DTD) :-
  341    (   nonvar(E)
  342    ->  true
  343    ;   '$dtd_property'(DTD, elements(EL)),
  344        member(E, EL)
  345    ).
  346prop(attribute(E, A, _, _), DTD) :-
  347    (   nonvar(E)
  348    ->  true
  349    ;   '$dtd_property'(DTD, elements(EL)),
  350        member(E, EL)
  351    ),
  352    (   nonvar(A)
  353    ->  true
  354    ;   '$dtd_property'(DTD, attributes(E, AL)),
  355        member(A, AL)
  356    ).
  357prop(notation(N, _), DTD) :-
  358    (   nonvar(N)
  359    ->  true
  360    ;   '$dtd_property'(DTD, notations(NL)),
  361        member(N, NL)
  362    ).
  363
  364dtd_property(DTD, Prop) :-
  365    prop(Prop, DTD),
  366    '$dtd_property'(DTD, Prop).
  367
  368
  369                 /*******************************
  370                 *             SGML             *
  371                 *******************************/
  372
  373%!  load_structure(+Source, -ListOfContent, :Options) is det.
  374%
  375%   Parse   Source   and   return   the   resulting   structure   in
  376%   ListOfContent. Source is handed to  open_any/5, which allows for
  377%   processing an extensible set of input sources.
  378%
  379%   A proper XML document contains only   a  single toplevel element
  380%   whose name matches the document type.   Nevertheless,  a list is
  381%   returned for consistency with  the   representation  of  element
  382%   content.
  383%
  384%   The  encoding(+Encoding)  option   is    treated   special   for
  385%   compatibility reasons:
  386%
  387%     - If `Encoding` is one of =iso-8859-1=, =us-ascii= or =utf-8=,
  388%       the stream is opened in binary mode and the option is passed
  389%       to the SGML parser.
  390%     - If `Encoding` is present, but not one of the above, the
  391%       stream is opened in text mode using the given encoding.
  392%     - Otherwise (no `Encoding`), the stream is opened in binary
  393%       mode and doing the correct decoding is left to the parser.
  394
  395load_structure(Spec, DOM, Options) :-
  396    sgml_open_options(Options, OpenOptions, SGMLOptions),
  397    setup_call_cleanup(
  398        open_any(Spec, read, In, Close, OpenOptions),
  399        load_structure_from_stream(In, DOM, SGMLOptions),
  400        close_any(Close)).
  401
  402sgml_open_options(Options, OpenOptions, SGMLOptions) :-
  403    Options = M:Plain,
  404    (   select_option(encoding(Encoding), Plain, NoEnc)
  405    ->  (   sgml_encoding(Encoding)
  406        ->  merge_options(NoEnc, [type(binary)], OpenOptions),
  407            SGMLOptions = Options
  408        ;   OpenOptions = Plain,
  409            SGMLOptions = M:NoEnc
  410        )
  411    ;   merge_options(Plain, [type(binary)], OpenOptions),
  412        SGMLOptions = Options
  413    ).
  414
  415sgml_encoding(Enc) :-
  416    downcase_atom(Enc, Enc1),
  417    sgml_encoding_l(Enc1).
  418
  419sgml_encoding_l('iso-8859-1').
  420sgml_encoding_l('us-ascii').
  421sgml_encoding_l('utf-8').
  422sgml_encoding_l('utf8').
  423sgml_encoding_l('iso_latin_1').
  424sgml_encoding_l('ascii').
  425
  426load_structure_from_stream(In, Term, M:Options) :-
  427    (   select_option(dtd(DTD), Options, Options1)
  428    ->  ExplicitDTD = true
  429    ;   ExplicitDTD = false,
  430        Options1 = Options
  431    ),
  432    move_front(Options1, dialect(_), Options2), % dialect sets defaults
  433    setup_call_cleanup(
  434        new_sgml_parser(Parser,
  435                        [ dtd(DTD)
  436                        ]),
  437        parse(Parser, M:Options2, TermRead, In),
  438        free_sgml_parser(Parser)),
  439    (   ExplicitDTD == true
  440    ->  (   DTD = dtd(_, DocType),
  441            dtd_property(DTD, doctype(DocType))
  442        ->  true
  443        ;   true
  444        )
  445    ;   free_dtd(DTD)
  446    ),
  447    Term = TermRead.
  448
  449move_front(Options0, Opt, Options) :-
  450    selectchk(Opt, Options0, Options1),
  451    !,
  452    Options = [Opt|Options1].
  453move_front(Options, _, Options).
  454
  455
  456parse(Parser, M:Options, Document, In) :-
  457    set_parser_options(Options, Parser, In, Options1),
  458    parser_meta_options(Options1, M, Options2),
  459    set_input_location(Parser, In),
  460    sgml_parse(Parser,
  461               [ document(Document),
  462                 source(In)
  463               | Options2
  464               ]).
  465
  466set_parser_options([], _, _, []).
  467set_parser_options([H|T], Parser, In, Rest) :-
  468    (   set_parser_option(H, Parser, In)
  469    ->  set_parser_options(T, Parser, In, Rest)
  470    ;   Rest = [H|R2],
  471        set_parser_options(T, Parser, In, R2)
  472    ).
  473
  474set_parser_option(Var, _Parser, _In) :-
  475    var(Var),
  476    !,
  477    instantiation_error(Var).
  478set_parser_option(Option, Parser, _) :-
  479    def_entity(Option, Parser),
  480    !.
  481set_parser_option(offset(Offset), _Parser, In) :-
  482    !,
  483    seek(In, Offset, bof, _).
  484set_parser_option(Option, Parser, _In) :-
  485    parser_option(Option),
  486    !,
  487    set_sgml_parser(Parser, Option).
  488set_parser_option(Name=Value, Parser, In) :-
  489    Option =.. [Name,Value],
  490    set_parser_option(Option, Parser, In).
  491
  492
  493parser_option(dialect(_)).
  494parser_option(shorttag(_)).
  495parser_option(case_sensitive_attributes(_)).
  496parser_option(case_preserving_attributes(_)).
  497parser_option(system_entities(_)).
  498parser_option(max_memory(_)).
  499parser_option(file(_)).
  500parser_option(line(_)).
  501parser_option(space(_)).
  502parser_option(number(_)).
  503parser_option(defaults(_)).
  504parser_option(doctype(_)).
  505parser_option(qualify_attributes(_)).
  506parser_option(encoding(_)).
  507parser_option(keep_prefix(_)).
  508
  509
  510def_entity(entity(Name, Value), Parser) :-
  511    get_sgml_parser(Parser, dtd(DTD)),
  512    xml_quote_attribute(Value, QValue),
  513    setup_call_cleanup(open_dtd(DTD, [], Stream),
  514                       format(Stream, '<!ENTITY ~w "~w">~n',
  515                              [Name, QValue]),
  516                       close(Stream)).
  517def_entity(xmlns(URI), Parser) :-
  518    set_sgml_parser(Parser, xmlns(URI)).
  519def_entity(xmlns(NS, URI), Parser) :-
  520    set_sgml_parser(Parser, xmlns(NS, URI)).
  521
  522%!  parser_meta_options(+Options0, +Module, -Options)
  523%
  524%   Qualify meta-calling options to the parser.
  525
  526parser_meta_options([], _, []).
  527parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
  528    !,
  529    parser_meta_options(T0, M, T).
  530parser_meta_options([H|T0], M, [H|T]) :-
  531    parser_meta_options(T0, M, T).
  532
  533
  534%!  set_input_location(+Parser, +In:stream) is det.
  535%
  536%   Set the input location if this was not set explicitly
  537
  538set_input_location(Parser, _In) :-
  539    get_sgml_parser(Parser, file(_)),
  540    !.
  541set_input_location(Parser, In) :-
  542    stream_property(In, file_name(File)),
  543    !,
  544    set_sgml_parser(Parser, file(File)),
  545    stream_property(In, position(Pos)),
  546    set_sgml_parser(Parser, position(Pos)).
  547set_input_location(_, _).
  548
  549                 /*******************************
  550                 *           UTILITIES          *
  551                 *******************************/
  552
  553%!  load_sgml_file(+File, -DOM) is det.
  554%
  555%   Load SGML from File and unify   the resulting DOM structure with
  556%   DOM.
  557%
  558%   @deprecated     New code should use load_sgml/3.
  559
  560load_sgml_file(File, Term) :-
  561    load_sgml(File, Term, []).
  562
  563%!  load_xml_file(+File, -DOM) is det.
  564%
  565%   Load XML from File and unify   the  resulting DOM structure with
  566%   DOM.
  567%
  568%   @deprecated     New code should use load_xml/3.
  569
  570load_xml_file(File, Term) :-
  571    load_xml(File, Term, []).
  572
  573%!  load_html_file(+File, -DOM) is det.
  574%
  575%   Load HTML from File and unify   the resulting DOM structure with
  576%   DOM.
  577%
  578%   @deprecated     New code should use load_html/3.
  579
  580load_html_file(File, DOM) :-
  581    load_html(File, DOM, []).
  582
  583%!  load_html(+Input, -DOM, +Options) is det.
  584%
  585%   Load HTML text from Input and  unify the resulting DOM structure
  586%   with DOM. Options are passed   to load_structure/3, after adding
  587%   the following default options:
  588%
  589%     - dtd(DTD)
  590%     Pass the DTD for HTML as obtained using dtd(html, DTD).
  591%     - dialect(Dialect)
  592%     Current dialect from the Prolog flag =html_dialect=
  593%     - max_errors(-1)
  594%     - syntax_errors(quiet)
  595%     Most HTML encountered in the wild contains errors. Even in the
  596%     context of errors, the resulting DOM term is often a
  597%     reasonable guess at the intent of the author.
  598%
  599%   You may also want to use  the library(http/http_open) to support
  600%   loading from HTTP and HTTPS URLs. For example:
  601%
  602%   ==
  603%   :- use_module(library(http/http_open)).
  604%   :- use_module(library(sgml)).
  605%
  606%   load_html_url(URL, DOM) :-
  607%       load_html(URL, DOM, []).
  608%   ==
  609
  610load_html(File, Term, M:Options) :-
  611    current_prolog_flag(html_dialect, Dialect),
  612    dtd(Dialect, DTD),
  613    merge_options(Options,
  614                  [ dtd(DTD),
  615                    dialect(Dialect),
  616                    max_errors(-1),
  617                    syntax_errors(quiet)
  618                  ], Options1),
  619    load_structure(File, Term, M:Options1).
  620
  621%!  load_xml(+Input, -DOM, +Options) is det.
  622%
  623%   Load XML text from Input and   unify the resulting DOM structure
  624%   with DOM. Options are passed   to load_structure/3, after adding
  625%   the following default options:
  626%
  627%     - dialect(xml)
  628
  629load_xml(Input, DOM, M:Options) :-
  630    merge_options(Options,
  631                  [ dialect(xml)
  632                  ], Options1),
  633    load_structure(Input, DOM, M:Options1).
  634
  635%!  load_sgml(+Input, -DOM, +Options) is det.
  636%
  637%   Load SGML text from Input and  unify the resulting DOM structure
  638%   with DOM. Options are passed   to load_structure/3, after adding
  639%   the following default options:
  640%
  641%     - dialect(sgml)
  642
  643load_sgml(Input, DOM, M:Options) :-
  644    merge_options(Options,
  645                  [ dialect(sgml)
  646                  ], Options1),
  647    load_structure(Input, DOM, M:Options1).
  648
  649
  650
  651                 /*******************************
  652                 *            ENCODING          *
  653                 *******************************/
  654
  655%!  xml_quote_attribute(+In, -Quoted) is det.
  656%!  xml_quote_cdata(+In, -Quoted) is det.
  657%
  658%   Backward  compatibility  for  versions  that  allow  to  specify
  659%   encoding. All characters that cannot fit the encoding are mapped
  660%   to XML character entities (&#dd;).  Using   ASCII  is the safest
  661%   value.
  662
  663xml_quote_attribute(In, Quoted) :-
  664    xml_quote_attribute(In, Quoted, ascii).
  665
  666xml_quote_cdata(In, Quoted) :-
  667    xml_quote_cdata(In, Quoted, ascii).
  668
  669%!  xml_name(+Atom) is semidet.
  670%
  671%   True if Atom is a valid XML name.
  672
  673xml_name(In) :-
  674    xml_name(In, ascii).
  675
  676
  677                 /*******************************
  678                 *    XML CHARACTER CLASSES     *
  679                 *******************************/
  680
  681%!  xml_basechar(+CodeOrChar) is semidet.
  682%!  xml_ideographic(+CodeOrChar) is semidet.
  683%!  xml_combining_char(+CodeOrChar) is semidet.
  684%!  xml_digit(+CodeOrChar) is semidet.
  685%!  xml_extender(+CodeOrChar) is semidet.
  686%
  687%   XML  character  classification   predicates.    Each   of  these
  688%   predicates accept both a character   (one-character  atom) and a
  689%   code (integer).
  690%
  691%   @see http://www.w3.org/TR/2006/REC-xml-20060816
  692
  693
  694                 /*******************************
  695                 *         TYPE CHECKING        *
  696                 *******************************/
  697
  698%!  xml_is_dom(@Term) is semidet.
  699%
  700%   True  if  term  statisfies   the    structure   as  returned  by
  701%   load_structure/3 and friends.
  702
  703xml_is_dom(0) :- !, fail.               % catch variables
  704xml_is_dom(List) :-
  705    is_list(List),
  706    !,
  707    xml_is_content_list(List).
  708xml_is_dom(Term) :-
  709    xml_is_element(Term).
  710
  711xml_is_content_list([]).
  712xml_is_content_list([H|T]) :-
  713    xml_is_content(H),
  714    xml_is_content_list(T).
  715
  716xml_is_content(0) :- !, fail.
  717xml_is_content(pi(Pi)) :-
  718    !,
  719    atom(Pi).
  720xml_is_content(CDATA) :-
  721    atom(CDATA),
  722    !.
  723xml_is_content(CDATA) :-
  724    string(CDATA),
  725    !.
  726xml_is_content(Term) :-
  727    xml_is_element(Term).
  728
  729xml_is_element(element(Name, Attributes, Content)) :-
  730    dom_name(Name),
  731    dom_attributes(Attributes),
  732    xml_is_content_list(Content).
  733
  734dom_name(NS:Local) :-
  735    atom(NS),
  736    atom(Local),
  737    !.
  738dom_name(Local) :-
  739    atom(Local).
  740
  741dom_attributes(0) :- !, fail.
  742dom_attributes([]).
  743dom_attributes([H|T]) :-
  744    dom_attribute(H),
  745    dom_attributes(T).
  746
  747dom_attribute(Name=Value) :-
  748    dom_name(Name),
  749    atomic(Value).
  750
  751
  752                 /*******************************
  753                 *            MESSAGES          *
  754                 *******************************/
  755:- multifile
  756    prolog:message/3.  757
  758%       Catch messages.  sgml/4 is generated by the SGML2PL binding.
  759
  760prolog:message(sgml(Parser, File, Line, Message)) -->
  761    { get_sgml_parser(Parser, dialect(Dialect))
  762    },
  763    [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
  764
  765
  766                 /*******************************
  767                 *         XREF SUPPORT         *
  768                 *******************************/
  769
  770:- multifile
  771    prolog:called_by/2.  772
  773prolog:called_by(sgml_parse(_, Options), Called) :-
  774    findall(Meta, meta_call_term(_, Meta, Options), Called).
  775
  776meta_call_term(T, G+N, Options) :-
  777    T = call(Event, G),
  778    pmember(T, Options),
  779    call_params(Event, Term),
  780    functor(Term, _, N).
  781
  782pmember(X, List) :-                     % member for partial lists
  783    nonvar(List),
  784    List = [H|T],
  785    (   X = H
  786    ;   pmember(X, T)
  787    ).
  788
  789call_params(begin, begin(tag,attributes,parser)).
  790call_params(end,   end(tag,parser)).
  791call_params(cdata, cdata(cdata,parser)).
  792call_params(pi,    pi(cdata,parser)).
  793call_params(decl,  decl(cdata,parser)).
  794call_params(error, error(severity,message,parser)).
  795call_params(xmlns, xmlns(namespace,url,parser)).
  796call_params(urlns, urlns(url,url,parser)).
  797
  798                 /*******************************
  799                 *           SANDBOX            *
  800                 *******************************/
  801
  802:- multifile
  803    sandbox:safe_primitive/1,
  804    sandbox:safe_meta_predicate/1.  805
  806sandbox:safe_meta_predicate(sgml:load_structure/3).
  807sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
  808    dtd_alias(Dialect, _).
  809sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
  810sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
  811sandbox:safe_primitive(sgml:xml_name(_,_)).
  812sandbox:safe_primitive(sgml:xml_basechar(_)).
  813sandbox:safe_primitive(sgml:xml_ideographic(_)).
  814sandbox:safe_primitive(sgml:xml_combining_char(_)).
  815sandbox:safe_primitive(sgml:xml_digit(_)).
  816sandbox:safe_primitive(sgml:xml_extender(_)).
  817sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
  818sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
  819sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))