1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2018, University of Amsterdam 7 VU University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(sgml, 37 [ load_html/3, % +Input, -DOM, +Options 38 load_xml/3, % +Input, -DOM, +Options 39 load_sgml/3, % +Input, -DOM, +Options 40 41 load_sgml_file/2, % +File, -ListOfContent 42 load_xml_file/2, % +File, -ListOfContent 43 load_html_file/2, % +File, -Document 44 45 load_structure/3, % +File, -Term, +Options 46 47 load_dtd/2, % +DTD, +File 48 load_dtd/3, % +DTD, +File, +Options 49 dtd/2, % +Type, -DTD 50 dtd_property/2, % +DTD, ?Property 51 52 new_dtd/2, % +Doctype, -DTD 53 free_dtd/1, % +DTD 54 open_dtd/3, % +DTD, +Options, -Stream 55 56 new_sgml_parser/2, % -Parser, +Options 57 free_sgml_parser/1, % +Parser 58 set_sgml_parser/2, % +Parser, +Options 59 get_sgml_parser/2, % +Parser, +Options 60 sgml_parse/2, % +Parser, +Options 61 62 sgml_register_catalog_file/2, % +File, +StartOrEnd 63 64 xml_quote_attribute/3, % +In, -Quoted, +Encoding 65 xml_quote_cdata/3, % +In, -Quoted, +Encoding 66 xml_quote_attribute/2, % +In, -Quoted 67 xml_quote_cdata/2, % +In, -Quoted 68 xml_name/1, % +In 69 xml_name/2, % +In, +Encoding 70 71 xsd_number_string/2, % ?Number, ?String 72 xsd_time_string/3, % ?Term, ?Type, ?String 73 74 xml_basechar/1, % +Code 75 xml_ideographic/1, % +Code 76 xml_combining_char/1, % +Code 77 xml_digit/1, % +Code 78 xml_extender/1, % +Code 79 80 iri_xml_namespace/2, % +IRI, -Namespace 81 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 82 xml_is_dom/1 % +Term 83 ]). 84:- use_module(library(lists)). 85:- use_module(library(option)). 86:- use_module(library(error)). 87:- use_module(library(iostream)). 88 89:- meta_predicate 90 load_structure( , , ), 91 load_html( , , ), 92 load_xml( , , ), 93 load_sgml( , , ). 94 95:- predicate_options(load_structure/3, 3, 96 [ charpos(integer), 97 cdata(oneof([atom,string])), 98 defaults(boolean), 99 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 100 doctype(atom), 101 dtd(any), 102 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 103 entity(atom,atom), 104 keep_prefix(boolean), 105 file(atom), 106 line(integer), 107 offset(integer), 108 number(oneof([token,integer])), 109 qualify_attributes(boolean), 110 shorttag(boolean), 111 case_sensitive_attributes(boolean), 112 case_preserving_attributes(boolean), 113 system_entities(boolean), 114 max_memory(integer), 115 space(oneof([sgml,preserve,default,remove])), 116 xmlns(atom), 117 xmlns(atom,atom), 118 pass_to(sgml_parse/2, 2) 119 ]). 120:- predicate_options(load_html/3, 3, 121 [ pass_to(load_structure/3, 3) 122 ]). 123:- predicate_options(load_xml/3, 3, 124 [ pass_to(load_structure/3, 3) 125 ]). 126:- predicate_options(load_sgml/3, 3, 127 [ pass_to(load_structure/3, 3) 128 ]). 129:- predicate_options(load_dtd/3, 3, 130 [ dialect(oneof([sgml,xml,xmlns])), 131 pass_to(open/4, 4) 132 ]). 133:- predicate_options(sgml_parse/2, 2, 134 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 135 callable), 136 cdata(oneof([atom,string])), 137 content_length(integer), 138 document(-any), 139 max_errors(integer), 140 parse(oneof([file,element,content,declaration,input])), 141 source(any), 142 syntax_errors(oneof([quiet,print,style])), 143 xml_no_ns(oneof([error,quiet])) 144 ]). 145:- predicate_options(new_sgml_parser/2, 2, 146 [ dtd(any) 147 ]).
177:- multifile user:file_search_path/2. 178:- dynamic user:file_search_path/2. 179 180user:file_search_path(dtd, '.'). 181user:file_search_path(dtd, swi('library/DTD')). 182 183sgml_register_catalog_file(File, Location) :- 184 prolog_to_os_filename(File, OsFile), 185 '_sgml_register_catalog_file'(OsFile, Location). 186 187:- use_foreign_library(foreign(sgml2pl)). 188 189register_catalog(Base) :- 190 absolute_file_name(dtd(Base), 191 [ extensions([soc]), 192 access(read), 193 file_errors(fail) 194 ], 195 SocFile), 196 sgml_register_catalog_file(SocFile, end). 197 198:- initialization 199 ignore(register_catalog('HTML4')). 200 201 202 /******************************* 203 * DTD HANDLING * 204 *******************************/ 205 206/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 207Note that concurrent access to DTD objects is not allowed, and hence we 208will allocate and destroy them in each thread. Possibibly it would be 209nicer to find out why concurrent access to DTD's is flawed. It is 210diagnosed to mess with the entity resolution by Fabien Todescato. 211- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 212 213:- thread_local 214 current_dtd/2. 215:- volatile 216 current_dtd/2. 217:- thread_local 218 registered_cleanup/0. 219:- volatile 220 registered_cleanup/0. 221 222:- multifile 223 dtd_alias/2. 224 225:- create_prolog_flag(html_dialect, html5, [type(atom)]). 226 227dtd_alias(html4, 'HTML4'). 228dtd_alias(html5, 'HTML5'). 229dtd_alias(html, DTD) :- 230 current_prolog_flag(html_dialect, Dialect), 231 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
243dtd(Type, DTD) :- 244 current_dtd(Type, DTD), 245 !. 246dtd(Type, DTD) :- 247 new_dtd(Type, DTD), 248 ( dtd_alias(Type, Base) 249 -> true 250 ; Base = Type 251 ), 252 absolute_file_name(dtd(Base), 253 [ extensions([dtd]), 254 access(read) 255 ], DtdFile), 256 load_dtd(DTD, DtdFile), 257 register_cleanup, 258 asserta(current_dtd(Type, DTD)).
273load_dtd(DTD, DtdFile) :- 274 load_dtd(DTD, DtdFile, []). 275load_dtd(DTD, DtdFile, Options) :- 276 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 277 setup_call_cleanup( 278 open_dtd(DTD, DTDOptions, DtdOut), 279 setup_call_cleanup( 280 open(DtdFile, read, DtdIn, OpenOptions), 281 copy_stream_data(DtdIn, DtdOut), 282 close(DtdIn)), 283 close(DtdOut)). 284 285split_dtd_options([], [], []). 286split_dtd_options([H|T], [H|TD], S) :- 287 dtd_option(H), 288 !, 289 split_dtd_options(T, TD, S). 290split_dtd_options([H|T], TD, [H|S]) :- 291 split_dtd_options(T, TD, S). 292 293dtd_option(dialect(_)).
301destroy_dtds :-
302 ( current_dtd(_Type, DTD),
303 free_dtd(DTD),
304 fail
305 ; true
306 ).
312register_cleanup :- 313 registered_cleanup, 314 !. 315register_cleanup :- 316 catch(thread_at_exit(destroy_dtds), _, true), 317 assert(registered_cleanup). 318 319 320 /******************************* 321 * EXAMINE DTD * 322 *******************************/ 323 324prop(doctype(_), _). 325prop(elements(_), _). 326prop(entities(_), _). 327prop(notations(_), _). 328prop(entity(E, _), DTD) :- 329 ( nonvar(E) 330 -> true 331 ; '$dtd_property'(DTD, entities(EL)), 332 member(E, EL) 333 ). 334prop(element(E, _, _), DTD) :- 335 ( nonvar(E) 336 -> true 337 ; '$dtd_property'(DTD, elements(EL)), 338 member(E, EL) 339 ). 340prop(attributes(E, _), DTD) :- 341 ( nonvar(E) 342 -> true 343 ; '$dtd_property'(DTD, elements(EL)), 344 member(E, EL) 345 ). 346prop(attribute(E, A, _, _), DTD) :- 347 ( nonvar(E) 348 -> true 349 ; '$dtd_property'(DTD, elements(EL)), 350 member(E, EL) 351 ), 352 ( nonvar(A) 353 -> true 354 ; '$dtd_property'(DTD, attributes(E, AL)), 355 member(A, AL) 356 ). 357prop(notation(N, _), DTD) :- 358 ( nonvar(N) 359 -> true 360 ; '$dtd_property'(DTD, notations(NL)), 361 member(N, NL) 362 ). 363 364dtd_property(DTD, Prop) :- 365 prop(Prop, DTD), 366 '$dtd_property'(DTD, Prop). 367 368 369 /******************************* 370 * SGML * 371 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.395load_structure(Spec, DOM, Options) :- 396 sgml_open_options(Options, OpenOptions, SGMLOptions), 397 setup_call_cleanup( 398 open_any(Spec, read, In, Close, OpenOptions), 399 load_structure_from_stream(In, DOM, SGMLOptions), 400 close_any(Close)). 401 402sgml_open_options(Options, OpenOptions, SGMLOptions) :- 403 Options = M:Plain, 404 ( select_option(encoding(Encoding), Plain, NoEnc) 405 -> ( sgml_encoding(Encoding) 406 -> merge_options(NoEnc, [type(binary)], OpenOptions), 407 SGMLOptions = Options 408 ; OpenOptions = Plain, 409 SGMLOptions = M:NoEnc 410 ) 411 ; merge_options(Plain, [type(binary)], OpenOptions), 412 SGMLOptions = Options 413 ). 414 415sgml_encoding(Enc) :- 416 downcase_atom(Enc, Enc1), 417 sgml_encoding_l(Enc1). 418 419sgml_encoding_l('iso-8859-1'). 420sgml_encoding_l('us-ascii'). 421sgml_encoding_l('utf-8'). 422sgml_encoding_l('utf8'). 423sgml_encoding_l('iso_latin_1'). 424sgml_encoding_l('ascii'). 425 426load_structure_from_stream(In, Term, M:Options) :- 427 ( select_option(dtd(DTD), Options, Options1) 428 -> ExplicitDTD = true 429 ; ExplicitDTD = false, 430 Options1 = Options 431 ), 432 move_front(Options1, dialect(_), Options2), % dialect sets defaults 433 setup_call_cleanup( 434 new_sgml_parser(Parser, 435 [ dtd(DTD) 436 ]), 437 parse(Parser, M:Options2, TermRead, In), 438 free_sgml_parser(Parser)), 439 ( ExplicitDTD == true 440 -> ( DTD = dtd(_, DocType), 441 dtd_property(DTD, doctype(DocType)) 442 -> true 443 ; true 444 ) 445 ; free_dtd(DTD) 446 ), 447 Term = TermRead. 448 449move_front(Options0, Opt, Options) :- 450 selectchk(Opt, Options0, Options1), 451 !, 452 Options = [Opt|Options1]. 453move_front(Options, _, Options). 454 455 456parse(Parser, M:Options, Document, In) :- 457 set_parser_options(Options, Parser, In, Options1), 458 parser_meta_options(Options1, M, Options2), 459 set_input_location(Parser, In), 460 sgml_parse(Parser, 461 [ document(Document), 462 source(In) 463 | Options2 464 ]). 465 466set_parser_options([], _, _, []). 467set_parser_options([H|T], Parser, In, Rest) :- 468 ( set_parser_option(H, Parser, In) 469 -> set_parser_options(T, Parser, In, Rest) 470 ; Rest = [H|R2], 471 set_parser_options(T, Parser, In, R2) 472 ). 473 474set_parser_option(Var, _Parser, _In) :- 475 var(Var), 476 !, 477 instantiation_error(Var). 478set_parser_option(Option, Parser, _) :- 479 def_entity(Option, Parser), 480 !. 481set_parser_option(offset(Offset), _Parser, In) :- 482 !, 483 seek(In, Offset, bof, _). 484set_parser_option(Option, Parser, _In) :- 485 parser_option(Option), 486 !, 487 set_sgml_parser(Parser, Option). 488set_parser_option(Name=Value, Parser, In) :- 489 Option =.. [Name,Value], 490 set_parser_option(Option, Parser, In). 491 492 493parser_option(dialect(_)). 494parser_option(shorttag(_)). 495parser_option(case_sensitive_attributes(_)). 496parser_option(case_preserving_attributes(_)). 497parser_option(system_entities(_)). 498parser_option(max_memory(_)). 499parser_option(file(_)). 500parser_option(line(_)). 501parser_option(space(_)). 502parser_option(number(_)). 503parser_option(defaults(_)). 504parser_option(doctype(_)). 505parser_option(qualify_attributes(_)). 506parser_option(encoding(_)). 507parser_option(keep_prefix(_)). 508 509 510def_entity(entity(Name, Value), Parser) :- 511 get_sgml_parser(Parser, dtd(DTD)), 512 xml_quote_attribute(Value, QValue), 513 setup_call_cleanup(open_dtd(DTD, [], Stream), 514 format(Stream, '<!ENTITY ~w "~w">~n', 515 [Name, QValue]), 516 close(Stream)). 517def_entity(xmlns(URI), Parser) :- 518 set_sgml_parser(Parser, xmlns(URI)). 519def_entity(xmlns(NS, URI), Parser) :- 520 set_sgml_parser(Parser, xmlns(NS, URI)).
526parser_meta_options([], _, []). 527parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 528 !, 529 parser_meta_options(T0, M, T). 530parser_meta_options([H|T0], M, [H|T]) :- 531 parser_meta_options(T0, M, T).
538set_input_location(Parser, _In) :- 539 get_sgml_parser(Parser, file(_)), 540 !. 541set_input_location(Parser, In) :- 542 stream_property(In, file_name(File)), 543 !, 544 set_sgml_parser(Parser, file(File)), 545 stream_property(In, position(Pos)), 546 set_sgml_parser(Parser, position(Pos)). 547set_input_location(_, _). 548 549 /******************************* 550 * UTILITIES * 551 *******************************/
560load_sgml_file(File, Term) :-
561 load_sgml(File, Term, []).
570load_xml_file(File, Term) :-
571 load_xml(File, Term, []).
580load_html_file(File, DOM) :-
581 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open)
to support
loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
610load_html(File, Term, M:Options) :-
611 current_prolog_flag(html_dialect, Dialect),
612 dtd(Dialect, DTD),
613 merge_options(Options,
614 [ dtd(DTD),
615 dialect(Dialect),
616 max_errors(-1),
617 syntax_errors(quiet)
618 ], Options1),
619 load_structure(File, Term, M:Options1).
dialect(xml)
629load_xml(Input, DOM, M:Options) :-
630 merge_options(Options,
631 [ dialect(xml)
632 ], Options1),
633 load_structure(Input, DOM, M:Options1).
dialect(sgml)
643load_sgml(Input, DOM, M:Options) :- 644 merge_options(Options, 645 [ dialect(sgml) 646 ], Options1), 647 load_structure(Input, DOM, M:Options1). 648 649 650 651 /******************************* 652 * ENCODING * 653 *******************************/
663xml_quote_attribute(In, Quoted) :- 664 xml_quote_attribute(In, Quoted, ascii). 665 666xml_quote_cdata(In, Quoted) :- 667 xml_quote_cdata(In, Quoted, ascii).
673xml_name(In) :- 674 xml_name(In, ascii). 675 676 677 /******************************* 678 * XML CHARACTER CLASSES * 679 *******************************/
694 /******************************* 695 * TYPE CHECKING * 696 *******************************/
703xml_is_dom(0) :- !, fail. % catch variables 704xml_is_dom(List) :- 705 is_list(List), 706 !, 707 xml_is_content_list(List). 708xml_is_dom(Term) :- 709 xml_is_element(Term). 710 711xml_is_content_list([]). 712xml_is_content_list([H|T]) :- 713 xml_is_content(H), 714 xml_is_content_list(T). 715 716xml_is_content(0) :- !, fail. 717xml_is_content(pi(Pi)) :- 718 !, 719 atom(Pi). 720xml_is_content(CDATA) :- 721 atom(CDATA), 722 !. 723xml_is_content(CDATA) :- 724 string(CDATA), 725 !. 726xml_is_content(Term) :- 727 xml_is_element(Term). 728 729xml_is_element(element(Name, Attributes, Content)) :- 730 dom_name(Name), 731 dom_attributes(Attributes), 732 xml_is_content_list(Content). 733 734dom_name(NS:Local) :- 735 atom(NS), 736 atom(Local), 737 !. 738dom_name(Local) :- 739 atom(Local). 740 741dom_attributes(0) :- !, fail. 742dom_attributes([]). 743dom_attributes([H|T]) :- 744 dom_attribute(H), 745 dom_attributes(T). 746 747dom_attribute(Name=Value) :- 748 dom_name(Name), 749 atomic(Value). 750 751 752 /******************************* 753 * MESSAGES * 754 *******************************/ 755:- multifile 756 prolog:message/3. 757 758% Catch messages. sgml/4 is generated by the SGML2PL binding. 759 760prologmessage(sgml(Parser, File, Line, Message)) --> 761 { get_sgml_parser(Parser, dialect(Dialect)) 762 }, 763 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 764 765 766 /******************************* 767 * XREF SUPPORT * 768 *******************************/ 769 770:- multifile 771 prolog:called_by/2. 772 773prologcalled_by(sgml_parse(_, Options), Called) :- 774 findall(Meta, meta_call_term(_, Meta, Options), Called). 775 776meta_call_term(T, G+N, Options) :- 777 T = call(Event, G), 778 pmember(T, Options), 779 call_params(Event, Term), 780 functor(Term, _, N). 781 782pmember(X, List) :- % member for partial lists 783 nonvar(List), 784 List = [H|T], 785 ( X = H 786 ; pmember(X, T) 787 ). 788 789call_params(begin, begin(tag,attributes,parser)). 790call_params(end, end(tag,parser)). 791call_params(cdata, cdata(cdata,parser)). 792call_params(pi, pi(cdata,parser)). 793call_params(decl, decl(cdata,parser)). 794call_params(error, error(severity,message,parser)). 795call_params(xmlns, xmlns(namespace,url,parser)). 796call_params(urlns, urlns(url,url,parser)). 797 798 /******************************* 799 * SANDBOX * 800 *******************************/ 801 802:- multifile 803 sandbox:safe_primitive/1, 804 sandbox:safe_meta_predicate/1. 805 806sandbox:safe_meta_predicate(sgml:load_structure/3). 807sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 808 dtd_alias(Dialect, _). 809sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 810sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 811sandbox:safe_primitive(sgml:xml_name(_,_)). 812sandbox:safe_primitive(sgml:xml_basechar(_)). 813sandbox:safe_primitive(sgml:xml_ideographic(_)). 814sandbox:safe_primitive(sgml:xml_combining_char(_)). 815sandbox:safe_primitive(sgml:xml_digit(_)). 816sandbox:safe_primitive(sgml:xml_extender(_)). 817sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 818sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 819sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by
library(xpath)
to extract information from the document.