35
36:- module(sgml,
37 [ load_html/3, 38 load_xml/3, 39 load_sgml/3, 40
41 load_sgml_file/2, 42 load_xml_file/2, 43 load_html_file/2, 44
45 load_structure/3, 46
47 load_dtd/2, 48 load_dtd/3, 49 dtd/2, 50 dtd_property/2, 51
52 new_dtd/2, 53 free_dtd/1, 54 open_dtd/3, 55
56 new_sgml_parser/2, 57 free_sgml_parser/1, 58 set_sgml_parser/2, 59 get_sgml_parser/2, 60 sgml_parse/2, 61
62 sgml_register_catalog_file/2, 63
64 xml_quote_attribute/3, 65 xml_quote_cdata/3, 66 xml_quote_attribute/2, 67 xml_quote_cdata/2, 68 xml_name/1, 69 xml_name/2, 70
71 xsd_number_string/2, 72 xsd_time_string/3, 73
74 xml_basechar/1, 75 xml_ideographic/1, 76 xml_combining_char/1, 77 xml_digit/1, 78 xml_extender/1, 79
80 iri_xml_namespace/2, 81 iri_xml_namespace/3, 82 xml_is_dom/1 83 ]). 84:- use_module(library(lists)). 85:- use_module(library(option)). 86:- use_module(library(error)). 87:- use_module(library(iostream)). 88
89:- meta_predicate
90 load_structure(+, -, :),
91 load_html(+, -, :),
92 load_xml(+, -, :),
93 load_sgml(+, -, :). 94
95:- predicate_options(load_structure/3, 3,
96 [ charpos(integer),
97 cdata(oneof([atom,string])),
98 defaults(boolean),
99 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
100 doctype(atom),
101 dtd(any),
102 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
103 entity(atom,atom),
104 keep_prefix(boolean),
105 file(atom),
106 line(integer),
107 offset(integer),
108 number(oneof([token,integer])),
109 qualify_attributes(boolean),
110 shorttag(boolean),
111 case_sensitive_attributes(boolean),
112 case_preserving_attributes(boolean),
113 system_entities(boolean),
114 max_memory(integer),
115 space(oneof([sgml,preserve,default,remove])),
116 xmlns(atom),
117 xmlns(atom,atom),
118 pass_to(sgml_parse/2, 2)
119 ]). 120:- predicate_options(load_html/3, 3,
121 [ pass_to(load_structure/3, 3)
122 ]). 123:- predicate_options(load_xml/3, 3,
124 [ pass_to(load_structure/3, 3)
125 ]). 126:- predicate_options(load_sgml/3, 3,
127 [ pass_to(load_structure/3, 3)
128 ]). 129:- predicate_options(load_dtd/3, 3,
130 [ dialect(oneof([sgml,xml,xmlns])),
131 pass_to(open/4, 4)
132 ]). 133:- predicate_options(sgml_parse/2, 2,
134 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
135 callable),
136 cdata(oneof([atom,string])),
137 content_length(integer),
138 document(-any),
139 max_errors(integer),
140 parse(oneof([file,element,content,declaration,input])),
141 source(any),
142 syntax_errors(oneof([quiet,print,style])),
143 xml_no_ns(oneof([error,quiet]))
144 ]). 145:- predicate_options(new_sgml_parser/2, 2,
146 [ dtd(any)
147 ]). 148
149
176
177:- multifile user:file_search_path/2. 178:- dynamic user:file_search_path/2. 179
180user:file_search_path(dtd, '.').
181user:file_search_path(dtd, swi('library/DTD')).
182
183sgml_register_catalog_file(File, Location) :-
184 prolog_to_os_filename(File, OsFile),
185 '_sgml_register_catalog_file'(OsFile, Location).
186
187:- use_foreign_library(foreign(sgml2pl)). 188
189register_catalog(Base) :-
190 absolute_file_name(dtd(Base),
191 [ extensions([soc]),
192 access(read),
193 file_errors(fail)
194 ],
195 SocFile),
196 sgml_register_catalog_file(SocFile, end).
197
198:- initialization
199 ignore(register_catalog('HTML4')). 200
201
202 205
212
213:- thread_local
214 current_dtd/2. 215:- volatile
216 current_dtd/2. 217:- thread_local
218 registered_cleanup/0. 219:- volatile
220 registered_cleanup/0. 221
222:- multifile
223 dtd_alias/2. 224
225:- create_prolog_flag(html_dialect, html5, [type(atom)]). 226
227dtd_alias(html4, 'HTML4').
228dtd_alias(html5, 'HTML5').
229dtd_alias(html, DTD) :-
230 current_prolog_flag(html_dialect, Dialect),
231 dtd_alias(Dialect, DTD).
232
242
243dtd(Type, DTD) :-
244 current_dtd(Type, DTD),
245 !.
246dtd(Type, DTD) :-
247 new_dtd(Type, DTD),
248 ( dtd_alias(Type, Base)
249 -> true
250 ; Base = Type
251 ),
252 absolute_file_name(dtd(Base),
253 [ extensions([dtd]),
254 access(read)
255 ], DtdFile),
256 load_dtd(DTD, DtdFile),
257 register_cleanup,
258 asserta(current_dtd(Type, DTD)).
259
272
273load_dtd(DTD, DtdFile) :-
274 load_dtd(DTD, DtdFile, []).
275load_dtd(DTD, DtdFile, Options) :-
276 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
277 setup_call_cleanup(
278 open_dtd(DTD, DTDOptions, DtdOut),
279 setup_call_cleanup(
280 open(DtdFile, read, DtdIn, OpenOptions),
281 copy_stream_data(DtdIn, DtdOut),
282 close(DtdIn)),
283 close(DtdOut)).
284
285split_dtd_options([], [], []).
286split_dtd_options([H|T], [H|TD], S) :-
287 dtd_option(H),
288 !,
289 split_dtd_options(T, TD, S).
290split_dtd_options([H|T], TD, [H|S]) :-
291 split_dtd_options(T, TD, S).
292
293dtd_option(dialect(_)).
294
295
300
301destroy_dtds :-
302 ( current_dtd(_Type, DTD),
303 free_dtd(DTD),
304 fail
305 ; true
306 ).
307
311
312register_cleanup :-
313 registered_cleanup,
314 !.
315register_cleanup :-
316 catch(thread_at_exit(destroy_dtds), _, true),
317 assert(registered_cleanup).
318
319
320 323
324prop(doctype(_), _).
325prop(elements(_), _).
326prop(entities(_), _).
327prop(notations(_), _).
328prop(entity(E, _), DTD) :-
329 ( nonvar(E)
330 -> true
331 ; '$dtd_property'(DTD, entities(EL)),
332 member(E, EL)
333 ).
334prop(element(E, _, _), DTD) :-
335 ( nonvar(E)
336 -> true
337 ; '$dtd_property'(DTD, elements(EL)),
338 member(E, EL)
339 ).
340prop(attributes(E, _), DTD) :-
341 ( nonvar(E)
342 -> true
343 ; '$dtd_property'(DTD, elements(EL)),
344 member(E, EL)
345 ).
346prop(attribute(E, A, _, _), DTD) :-
347 ( nonvar(E)
348 -> true
349 ; '$dtd_property'(DTD, elements(EL)),
350 member(E, EL)
351 ),
352 ( nonvar(A)
353 -> true
354 ; '$dtd_property'(DTD, attributes(E, AL)),
355 member(A, AL)
356 ).
357prop(notation(N, _), DTD) :-
358 ( nonvar(N)
359 -> true
360 ; '$dtd_property'(DTD, notations(NL)),
361 member(N, NL)
362 ).
363
364dtd_property(DTD, Prop) :-
365 prop(Prop, DTD),
366 '$dtd_property'(DTD, Prop).
367
368
369 372
394
395load_structure(Spec, DOM, Options) :-
396 sgml_open_options(Options, OpenOptions, SGMLOptions),
397 setup_call_cleanup(
398 open_any(Spec, read, In, Close, OpenOptions),
399 load_structure_from_stream(In, DOM, SGMLOptions),
400 close_any(Close)).
401
402sgml_open_options(Options, OpenOptions, SGMLOptions) :-
403 Options = M:Plain,
404 ( select_option(encoding(Encoding), Plain, NoEnc)
405 -> ( sgml_encoding(Encoding)
406 -> merge_options(NoEnc, [type(binary)], OpenOptions),
407 SGMLOptions = Options
408 ; OpenOptions = Plain,
409 SGMLOptions = M:NoEnc
410 )
411 ; merge_options(Plain, [type(binary)], OpenOptions),
412 SGMLOptions = Options
413 ).
414
415sgml_encoding(Enc) :-
416 downcase_atom(Enc, Enc1),
417 sgml_encoding_l(Enc1).
418
419sgml_encoding_l('iso-8859-1').
420sgml_encoding_l('us-ascii').
421sgml_encoding_l('utf-8').
422sgml_encoding_l('utf8').
423sgml_encoding_l('iso_latin_1').
424sgml_encoding_l('ascii').
425
426load_structure_from_stream(In, Term, M:Options) :-
427 ( select_option(dtd(DTD), Options, Options1)
428 -> ExplicitDTD = true
429 ; ExplicitDTD = false,
430 Options1 = Options
431 ),
432 move_front(Options1, dialect(_), Options2), 433 setup_call_cleanup(
434 new_sgml_parser(Parser,
435 [ dtd(DTD)
436 ]),
437 parse(Parser, M:Options2, TermRead, In),
438 free_sgml_parser(Parser)),
439 ( ExplicitDTD == true
440 -> ( DTD = dtd(_, DocType),
441 dtd_property(DTD, doctype(DocType))
442 -> true
443 ; true
444 )
445 ; free_dtd(DTD)
446 ),
447 Term = TermRead.
448
449move_front(Options0, Opt, Options) :-
450 selectchk(Opt, Options0, Options1),
451 !,
452 Options = [Opt|Options1].
453move_front(Options, _, Options).
454
455
456parse(Parser, M:Options, Document, In) :-
457 set_parser_options(Options, Parser, In, Options1),
458 parser_meta_options(Options1, M, Options2),
459 set_input_location(Parser, In),
460 sgml_parse(Parser,
461 [ document(Document),
462 source(In)
463 | Options2
464 ]).
465
466set_parser_options([], _, _, []).
467set_parser_options([H|T], Parser, In, Rest) :-
468 ( set_parser_option(H, Parser, In)
469 -> set_parser_options(T, Parser, In, Rest)
470 ; Rest = [H|R2],
471 set_parser_options(T, Parser, In, R2)
472 ).
473
474set_parser_option(Var, _Parser, _In) :-
475 var(Var),
476 !,
477 instantiation_error(Var).
478set_parser_option(Option, Parser, _) :-
479 def_entity(Option, Parser),
480 !.
481set_parser_option(offset(Offset), _Parser, In) :-
482 !,
483 seek(In, Offset, bof, _).
484set_parser_option(Option, Parser, _In) :-
485 parser_option(Option),
486 !,
487 set_sgml_parser(Parser, Option).
488set_parser_option(Name=Value, Parser, In) :-
489 Option =.. [Name,Value],
490 set_parser_option(Option, Parser, In).
491
492
493parser_option(dialect(_)).
494parser_option(shorttag(_)).
495parser_option(case_sensitive_attributes(_)).
496parser_option(case_preserving_attributes(_)).
497parser_option(system_entities(_)).
498parser_option(max_memory(_)).
499parser_option(file(_)).
500parser_option(line(_)).
501parser_option(space(_)).
502parser_option(number(_)).
503parser_option(defaults(_)).
504parser_option(doctype(_)).
505parser_option(qualify_attributes(_)).
506parser_option(encoding(_)).
507parser_option(keep_prefix(_)).
508
509
510def_entity(entity(Name, Value), Parser) :-
511 get_sgml_parser(Parser, dtd(DTD)),
512 xml_quote_attribute(Value, QValue),
513 setup_call_cleanup(open_dtd(DTD, [], Stream),
514 format(Stream, '<!ENTITY ~w "~w">~n',
515 [Name, QValue]),
516 close(Stream)).
517def_entity(xmlns(URI), Parser) :-
518 set_sgml_parser(Parser, xmlns(URI)).
519def_entity(xmlns(NS, URI), Parser) :-
520 set_sgml_parser(Parser, xmlns(NS, URI)).
521
525
526parser_meta_options([], _, []).
527parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
528 !,
529 parser_meta_options(T0, M, T).
530parser_meta_options([H|T0], M, [H|T]) :-
531 parser_meta_options(T0, M, T).
532
533
537
538set_input_location(Parser, _In) :-
539 get_sgml_parser(Parser, file(_)),
540 !.
541set_input_location(Parser, In) :-
542 stream_property(In, file_name(File)),
543 !,
544 set_sgml_parser(Parser, file(File)),
545 stream_property(In, position(Pos)),
546 set_sgml_parser(Parser, position(Pos)).
547set_input_location(_, _).
548
549 552
559
560load_sgml_file(File, Term) :-
561 load_sgml(File, Term, []).
562
569
570load_xml_file(File, Term) :-
571 load_xml(File, Term, []).
572
579
580load_html_file(File, DOM) :-
581 load_html(File, DOM, []).
582
609
610load_html(File, Term, M:Options) :-
611 current_prolog_flag(html_dialect, Dialect),
612 dtd(Dialect, DTD),
613 merge_options(Options,
614 [ dtd(DTD),
615 dialect(Dialect),
616 max_errors(-1),
617 syntax_errors(quiet)
618 ], Options1),
619 load_structure(File, Term, M:Options1).
620
628
629load_xml(Input, DOM, M:Options) :-
630 merge_options(Options,
631 [ dialect(xml)
632 ], Options1),
633 load_structure(Input, DOM, M:Options1).
634
642
643load_sgml(Input, DOM, M:Options) :-
644 merge_options(Options,
645 [ dialect(sgml)
646 ], Options1),
647 load_structure(Input, DOM, M:Options1).
648
649
650
651 654
662
663xml_quote_attribute(In, Quoted) :-
664 xml_quote_attribute(In, Quoted, ascii).
665
666xml_quote_cdata(In, Quoted) :-
667 xml_quote_cdata(In, Quoted, ascii).
668
672
673xml_name(In) :-
674 xml_name(In, ascii).
675
676
677 680
692
693
694 697
702
703xml_is_dom(0) :- !, fail. 704xml_is_dom(List) :-
705 is_list(List),
706 !,
707 xml_is_content_list(List).
708xml_is_dom(Term) :-
709 xml_is_element(Term).
710
711xml_is_content_list([]).
712xml_is_content_list([H|T]) :-
713 xml_is_content(H),
714 xml_is_content_list(T).
715
716xml_is_content(0) :- !, fail.
717xml_is_content(pi(Pi)) :-
718 !,
719 atom(Pi).
720xml_is_content(CDATA) :-
721 atom(CDATA),
722 !.
723xml_is_content(CDATA) :-
724 string(CDATA),
725 !.
726xml_is_content(Term) :-
727 xml_is_element(Term).
728
729xml_is_element(element(Name, Attributes, Content)) :-
730 dom_name(Name),
731 dom_attributes(Attributes),
732 xml_is_content_list(Content).
733
734dom_name(NS:Local) :-
735 atom(NS),
736 atom(Local),
737 !.
738dom_name(Local) :-
739 atom(Local).
740
741dom_attributes(0) :- !, fail.
742dom_attributes([]).
743dom_attributes([H|T]) :-
744 dom_attribute(H),
745 dom_attributes(T).
746
747dom_attribute(Name=Value) :-
748 dom_name(Name),
749 atomic(Value).
750
751
752 755:- multifile
756 prolog:message/3. 757
759
760prolog:message(sgml(Parser, File, Line, Message)) -->
761 { get_sgml_parser(Parser, dialect(Dialect))
762 },
763 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
764
765
766 769
770:- multifile
771 prolog:called_by/2. 772
773prolog:called_by(sgml_parse(_, Options), Called) :-
774 findall(Meta, meta_call_term(_, Meta, Options), Called).
775
776meta_call_term(T, G+N, Options) :-
777 T = call(Event, G),
778 pmember(T, Options),
779 call_params(Event, Term),
780 functor(Term, _, N).
781
782pmember(X, List) :- 783 nonvar(List),
784 List = [H|T],
785 ( X = H
786 ; pmember(X, T)
787 ).
788
789call_params(begin, begin(tag,attributes,parser)).
790call_params(end, end(tag,parser)).
791call_params(cdata, cdata(cdata,parser)).
792call_params(pi, pi(cdata,parser)).
793call_params(decl, decl(cdata,parser)).
794call_params(error, error(severity,message,parser)).
795call_params(xmlns, xmlns(namespace,url,parser)).
796call_params(urlns, urlns(url,url,parser)).
797
798 801
802:- multifile
803 sandbox:safe_primitive/1,
804 sandbox:safe_meta_predicate/1. 805
806sandbox:safe_meta_predicate(sgml:load_structure/3).
807sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
808 dtd_alias(Dialect, _).
809sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
810sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
811sandbox:safe_primitive(sgml:xml_name(_,_)).
812sandbox:safe_primitive(sgml:xml_basechar(_)).
813sandbox:safe_primitive(sgml:xml_ideographic(_)).
814sandbox:safe_primitive(sgml:xml_combining_char(_)).
815sandbox:safe_primitive(sgml:xml_digit(_)).
816sandbox:safe_primitive(sgml:xml_extender(_)).
817sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
818sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
819sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))