34
35
36:- module(html_text,
37 [ html_text/1, 38 html_text/2 39 ]). 40:- use_module(library(sgml)). 41:- use_module(library(lists)). 42:- use_module(library(debug)). 43:- use_module(library(apply)). 44:- use_module(library(option)). 45:- use_module(library(aggregate)). 46
47:- use_module(library(lynx/format)). 48:- use_module(library(lynx/html_style)).
64html_text(Input) :-
65 html_text(Input, []).
66
67html_text(Input, Options) :-
68 ( xml_is_dom(Input)
69 -> DOM = Input
70 ; load_html(Input, DOM, Options)
71 ),
72 default_state(State0),
73 state_options(Options, State0, State),
74 init_nl,
75 format_dom(DOM, State).
76
77state_options([], State, State).
78state_options([H|T], State0, State) :-
79 H =.. [Key,Value],
80 ( fmt_option(Key, Type, _Default)
81 -> must_be(Type, Value),
82 State1 = State0.put(Key,Value)
83 ; State1 = State0
84 ),
85 state_options(T, State1, State).
86
87fmt_option(margin_left, integer, 0).
88fmt_option(margin_right, integer, 0).
89fmt_option(text_align, oneof([justify, left]), justify).
90fmt_option(width, between(10,1000), 72).
91
92default_state(State) :-
93 findall(Key-Value, fmt_option(Key, _, Value), Pairs),
94 dict_pairs(Dict, _, Pairs),
95 State = Dict.put(_{ style:[], list:[]}).
101format_dom([], _) :-
102 !.
103format_dom([H|T], State) :-
104 format_dom(H, State),
105 !,
106 format_dom(T, State).
107format_dom(Content, State) :-
108 Content = [H0|_],
109 \+ is_block_element(H0),
110 !,
111 ( append(Inline, [H|T], Content),
112 is_block_element(H)
113 -> true
114 ; Inline = Content
115 ),
116 format_dom(element(p, [], Inline), State),
117 format_dom([H|T], State).
118format_dom(element(html, _, Content), State) :-
119 !,
120 format_dom(Content, State).
121format_dom(element(head, _, _), _) :-
122 !.
123format_dom(element(body, _, Content), State) :-
124 !,
125 format_dom(Content, State).
126format_dom(element(E, Attrs, Content), State) :-
127 !,
128 ( format_element(E, Attrs, Content, State)
129 -> true
130 ; debug(format(html), 'Skipped block element ~q', [E])
131 ).
132
133format_element(pre, Attrs, [Content], State) :-
134 !,
135 block_element(pre, Attrs, Top-Bottom, BlockAttrs, Style),
136 update_style(Style, State, State1),
137 ask_nl(Top),
138 emit_code(Content, BlockAttrs, State1),
139 ask_nl(Bottom).
140format_element(table, Attrs, Content, State) :-
141 !,
142 block_element(table, Attrs, Top-Bottom, BlockAttrs, Style),
143 update_style(Style, State, State1),
144 state_par_properties(State1, BlockAttrs, BlockOptions),
145 ask_nl(Top),
146 emit_nl,
147 format_table(Content, Attrs, BlockOptions, State1),
148 ask_nl(Bottom).
149format_element(hr, Attrs, _, State) :-
150 !,
151 block_element(hr, Attrs, Top-Bottom, BlockAttrs, Style),
152 update_style(Style, State, State1),
153 state_par_properties(State1, BlockAttrs, BlockOptions),
154 ask_nl(Top),
155 emit_nl,
156 emit_hr(Attrs, BlockOptions, State1),
157 ask_nl(Bottom).
158format_element(Elem, Attrs, Content, State) :-
159 block_element(Elem, Attrs, Top-Bottom, BlockAttrs, Style),
160 !,
161 update_style(Style, State, State1),
162 block_words(Content, SubBlocks, Words, State1),
163 ( Words == []
164 -> true
165 ; ask_nl(Top),
166 emit_block(Words, BlockAttrs, State1),
167 ask_nl(Bottom)
168 ),
169 ( SubBlocks \== []
170 -> update_state_par_properties(BlockAttrs, State1, State2),
171 format_dom(SubBlocks, State2)
172 ; true
173 ).
174format_element(Elem, Attrs, Content, State) :-
175 list_element(Elem, Attrs, Top-Bottom, State, State1),
176 !,
177 open_list(Elem, State1, State2),
178 ask_nl(Top),
179 format_list(Content, Elem, 1, State2),
180 ask_nl(Bottom).
181format_element(Elem, Attrs, Content, State) :-
182 format_list_element(element(Elem, Attrs, Content), none, 0, State).
188block_element(El, Attrs, Margins, ParOptions, Style) :-
189 block_element(El, Margins0, ParOptions0, Style0),
190 ( nonvar(Attrs),
191 element_css(El, Attrs, CSS)
192 -> css_block_options(CSS, Margins0, Margins, ParOptions, Style1),
193 append(Style1, Style0, Style2),
194 list_to_set(Style2, Style)
195 ; Margins = Margins0,
196 ParOptions = ParOptions0,
197 Style = Style0
198 ).
199
200block_element(p, 1-2, [], []).
201block_element(div, 1-1, [], []).
202block_element(hr, 1-1, [], []).
203block_element(h1, 2-2, [], [bold]).
204block_element(h2, 2-2, [], [bold]).
205block_element(h3, 2-2, [], [bold]).
206block_element(h4, 2-2, [], [bold]).
207block_element(pre, 2-2, [], []).
208block_element(blockquote, 2-2, [margin_left(4), margin_right(4)], []).
209block_element(table, 2-2, [], []).
210
211list_element(ul, _, Margins, State0, State) :-
212 margins(4, 4, State0, State),
213 list_level_margins(State, Margins).
214list_element(ol, _, Margins, State0, State) :-
215 margins(4, 4, State0, State),
216 list_level_margins(State, Margins).
217list_element(dl, _, 2-2, State, State).
218
219list_element(ul).
220list_element(ol).
221list_element(dl).
222
223list_level_margins(State, 2-2) :-
224 nonvar(State),
225 State.get(list) == [],
226 !.
227list_level_margins(_, 0-0).
228
229format_list([], _, _, _).
230format_list([H|T], Type, Nth, State) :-
231 format_list_element(H, Type, Nth, State),
232 ( T == []
233 -> true
234 ; Nth1 is Nth + 1,
235 format_list(T, Type, Nth1, State)
236 ).
237
238format_list_element(element(LE, Attrs, Content), Type, Nth, State) :-
239 setup_list_element(LE, Attrs, Type, Nth, ListParProps, State, State1),
240 block_words(Content, Blocks, Words, State1),
241 emit_block(Words, ListParProps, State1),
242 ( Blocks \== []
243 -> update_state_par_properties(ListParProps, State1, State2),
244 format_dom(Blocks, State2)
245 ; true
246 ).
247
248setup_list_element(li, _Attrs, _Type, Nth, ListParProps, State, State) :-
249 list_par_properties(State.list, Nth, ListParProps).
250setup_list_element(dt, _Attrs, _Type, _Nth, [], State, State2) :-
251 margins(0, 0, State, State1),
252 update_style([bold], State1, State2).
253setup_list_element(dd, _Attrs, _Type, _Nth, [], State, State1) :-
254 margins(4, 0, State, State1).
255
256list_item_element(li).
257list_item_element(dt).
258list_item_element(dd).
259
260list_par_properties([ul|_More], _, [bullet('\u2022')]).
261list_par_properties([ol|_More], N, [bullet(N)]).
268block_words(Content, RC, Words, State) :-
269 phrase(bwords(Content, RC, State), Words0),
270 join_whitespace(Words0, Words1),
271 text_format:trim_spaces(Words1, Words).
272
273bwords([], [], _) -->
274 !.
275bwords([H|T], Rest, _State) -->
276 { var(Rest),
277 is_block_element(H),
278 !,
279 Rest = [H|T]
280 }.
281bwords([H|T], Rest, State) -->
282 !,
283 bwordsel(H, State),
284 bwords(T, Rest, State).
285
286is_block_element(element(E,_,_)) :-
287 ( block_element(E, _, _, _)
288 ; list_element(E)
289 ; list_item_element(E)
290 ),
291 debug(format(html), 'Found block ~q', [E]),
292 !.
293
294bwordsel(element(Elem, Attrs, Content), State) -->
295 { styled_inline(Elem, Attrs, Margins, Style),
296 !,
297 update_style(Style, State, State1)
298 },
299 left_margin(Margins),
300 bwords(Content, [], State1),
301 right_margin(Margins).
302bwordsel(element(br, _, _), _State) -->
303 [br([])].
304bwordsel(CDATA, State) -->
305 { atomic(CDATA),
306 !,
307 split_string(CDATA, " \n\t\r", "", Words)
308 },
309 words(Words, State).
310bwordsel(element(Elem, _Attrs, _Content), _State) -->
311 { debug(format(html), 'Skipped inline element ~q', [Elem]) }.
312
313left_margin(0-_) --> !.
314left_margin(N-_) --> [b(N,_)].
315
316right_margin(_-0) --> !.
317right_margin(_-N) --> [b(N,_)].
318
319styled_inline(El, Attrs, Margins, Style) :-
320 styled_inline(El, Style0),
321 ( nonvar(Attrs),
322 element_css(El, Attrs, CSS)
323 -> css_inline_options(CSS, Margins, Style1),
324 append(Style1, Style0, Style2),
325 list_to_set(Style2, Style)
326 ; Style = Style0
327 ).
328
329styled_inline(b, [bold]).
330styled_inline(strong, [bold]).
331styled_inline(em, [bold]).
332styled_inline(span, []).
333styled_inline(i, [underline]).
334styled_inline(a, [underline]).
335styled_inline(var, []).
336styled_inline(code, []).
343words([], _) --> [].
344words([""|T0], State) -->
345 !,
346 { skip_leading_spaces(T0, T) },
347 space,
348 words(T, State).
349words([H|T], State) -->
350 word(H, State),
351 ( {T==[]}
352 -> []
353 ; { skip_leading_spaces(T, T1) },
354 space,
355 words(T1, State)
356 ).
357
358skip_leading_spaces([""|T0], T) :-
359 !,
360 skip_leading_spaces(T0, T).
361skip_leading_spaces(L, L).
362
363word(W, State) -->
364 { string_length(W, Len),
365 ( Style = State.get(style)
366 -> true
367 ; Style = []
368 )
369 },
370 [w(W, Len, Style)].
371
372space -->
373 [b(1,_)].
379join_whitespace([], []).
380join_whitespace([H0|T0], [H|T]) :-
381 join_whitespace(H0, H, T0, T1),
382 !,
383 join_whitespace(T1, T).
384join_whitespace([H|T0], [H|T]) :-
385 join_whitespace(T0, T).
386
387join_whitespace(b(Len0,_), b(Len,_), T0, T) :-
388 take_whitespace(T0, T, Len0, Len).
389
390take_whitespace([b(Len1,_)|T0], T, Len0, Len) :-
391 !,
392 Len2 is max(Len1,Len0),
393 take_whitespace(T0, T, Len2, Len).
394take_whitespace(L, L, Len, Len).
395
396
397
405update_style([], State, State) :-
406 !.
407update_style(Extra, State0, State) :-
408 ( get_dict(style, State0, Style0, State, Style)
409 -> add_style(Extra, Style0, Style)
410 ; add_style(Extra, [], Style),
411 put_dict(style, State0, Style, State)
412 ).
413
414add_style(Extra, Style0, Style) :-
415 reverse(Extra, RevExtra),
416 foldl(add1_style, RevExtra, Style0, Style).
422add1_style(New, Style0, Style) :-
423 ( style_overrides(New, Add, Overrides)
424 -> delete_all(Overrides, Style0, Style1),
425 append(Add, Style1, Style)
426 ; Style = [New|Style0]
427 ).
428
429delete_all([], List, List).
430delete_all([H|T], List0, List) :-
431 delete(List0, H, List1),
432 delete_all(T, List1, List).
433
434style_overrides(normal, [], [bold]).
435style_overrides(fg(C), [fg(C)], [fg(_), hfg(_)]).
436style_overrides(bg(C), [bg(C)], [bg(_), hbg(_)]).
437style_overrides(underline(false), [], [underline]).
438
439margins(Left, Right, State0, State) :-
440 _{ margin_left:ML0, margin_right:MR0 } >:< State0,
441 ML is ML0 + Left,
442 MR is MR0 + Right,
443 State = State0.put(_{margin_left:ML, margin_right:MR}).
444
445open_list(Type, State0, State) :-
446 get_dict(list, State0, Lists, State, [Type|Lists]).
447
448update_state_par_properties([], State, State).
449update_state_par_properties([H|T], State0, State) :-
450 H =.. [ Key, Value ],
451 State1 = State0.put(Key,Value),
452 update_state_par_properties(T, State1, State).
459state_par_properties(State, Props) :-
460 Props0 = [ margin_left(LM),
461 margin_right(RM),
462 text_align(TA),
463 width(W),
464 pad(Pad)
465 ],
466 _{margin_left:LM, margin_right:RM, text_align:TA, width:W,
467 pad:Pad} >:< State,
468 filled_par_props(Props0, Props).
469
470filled_par_props([], []).
471filled_par_props([H|T0], [H|T]) :-
472 arg(1, H, A),
473 nonvar(A),
474 !,
475 filled_par_props(T0, T).
476filled_par_props([_|T0], T) :-
477 filled_par_props(T0, T).
478
479
480state_par_properties(State, Options, BlockOptions) :-
481 state_par_properties(State, Options0),
482 foldl(merge_par_option, Options, Options0, BlockOptions).
483
484merge_par_option(margin_left(ML0), Options0, [margin_left(ML)|Options1]) :-
485 !,
486 select_option(margin_left(ML1), Options0, Options1, 0),
487 ML is ML0+ML1.
488merge_par_option(margin_right(MR0), Options0, [margin_right(MR)|Options1]) :-
489 !,
490 select_option(margin_right(MR1), Options0, Options1, 0),
491 MR is MR0+MR1.
492merge_par_option(Opt, Options0, Options) :-
493 merge_options([Opt], Options0, Options).
501emit_block([], _, _) :-
502 !.
503emit_block(Words, Options, State) :-
504 state_par_properties(State, Options, BlockOptions),
505 ask_nl(1),
506 emit_nl,
507 format_paragraph(Words, BlockOptions),
508 ask_nl(1).
516init_nl :-
517 nb_setval(nl_pending, start).
518
519init_nl(Old) :-
520 ( nb_current(nl_pending, Old)
521 -> true
522 ; Old = []
523 ),
524 nb_setval(nl_pending, start).
525exit_nl(Old) :-
526 nb_setval(nl_pending, Old).
527
528ask_nl(N) :-
529 ( nb_current(nl_pending, N0)
530 -> ( N0 == start
531 -> true
532 ; integer(N0)
533 -> N1 is max(N0, N),
534 nb_setval(nl_pending, N1)
535 ; nb_setval(nl_pending, N)
536 )
537 ; nb_setval(nl_pending, N)
538 ).
539
540emit_nl :-
541 ( nb_current(nl_pending, N),
542 integer(N)
543 -> forall(between(1,N,_), nl)
544 ; true
545 ),
546 nb_setval(nl_pending, 0).
547
548
549
555emit_code(Content, BlockAttrs, State) :-
556 Style = State.style,
557 split_string(Content, "\n", "", Lines),
558 option(margin_left(LM0), BlockAttrs, 4),
559 LM is LM0+State.margin_left,
560 ask_nl(1),
561 emit_nl,
562 emit_code_lines(Lines, 1, LM, Style),
563 ask_nl(1).
564
565emit_code_lines([], _, _, _).
566emit_code_lines([H|T], LineNo, LM, Style) :-
567 emit_code_line(H, LineNo, LM, Style),
568 LineNo1 is LineNo + 1,
569 emit_code_lines(T, LineNo1, LM, Style).
570
571emit_code_line(Line, _LineNo, LM, Style) :-
572 emit_nl,
573 emit_indent(LM),
574 ( Style == []
575 -> write(Line)
576 ; ansi_format(Style, '~s', [Line])
577 ),
578 ask_nl(1).
579
580emit_indent(N) :-
581 forall(between(1, N, _),
582 put_char(' ')).
583
584
585
591format_table(Content, Attrs, BlockAttrs, State) :-
592 tty_state(TTY),
593 option(margin_left(ML), BlockAttrs, 0),
594 option(margin_right(MR), BlockAttrs, 0),
595 MaxTableWidth is State.width - ML - MR,
596 table_cell_state(Attrs, State, CellState),
597 phrase(rows(Content), Rows),
598 columns(Rows, Columns),
599 maplist(auto_column_width(CellState.put(tty,false)), Columns, Widths),
600 column_widths(Widths, MaxTableWidth, ColWidths),
601 maplist(format_row(ColWidths, CellState.put(tty,TTY), ML), Rows).
602
603tty_state(TTY) :-
604 stream_property(current_output, tty(true)),
605 !,
606 TTY = true.
607tty_state(false).
615column_widths(Widths, MaxTableWidth, Widths) :-
616 sum_list(Widths, AutoWidth),
617 AutoWidth =< MaxTableWidth,
618 !.
619column_widths(AutoWidths, MaxTableWidth, Widths) :-
620 sort(0, >=, AutoWidths, Sorted),
621 append(Wrapped, Keep, Sorted),
622 sum_list(Keep, KeepWidth),
623 KeepWidth < MaxTableWidth/2,
624 length(Wrapped, NWrapped),
625 WideWidth is round((MaxTableWidth-KeepWidth)/NWrapped),
626 ( [KeepW|_] = Keep
627 -> true
628 ; KeepW = 0
629 ),
630 !,
631 maplist(truncate_column(KeepW,WideWidth), AutoWidths, Widths).
632
633truncate_column(Keep, WideWidth, AutoWidth, Width) :-
634 ( AutoWidth =< Keep
635 -> Width = AutoWidth
636 ; Width = WideWidth
637 ).
638
639table_cell_state(Attrs, State, CellState) :-
640 ( element_css(table, Attrs, CSS)
641 -> true
642 ; CSS = []
643 ),
644 option(padding_left(PL), CSS, 1),
645 option(padding_right(PR), CSS, 1),
646 CellState = State.put(_{margin_left:PL, margin_right:PR}).
651rows([]) --> [].
652rows([H|T]) --> rows(H), rows(T).
653rows([element(tbody,_,Content)|T]) --> rows(Content), rows(T).
654rows([element(tr,Attrs,Columns)|T]) --> [row(Columns, Attrs)], rows(T).
661columns(Rows, Columns) :-
662 columns(Rows, 1, Columns).
663
664columns(Rows, I, Columns) :-
665 maplist(row_column(I, Found), Rows, H),
666 ( Found == true
667 -> Columns = [H|T],
668 I2 is I + 1,
669 columns(Rows, I2, T)
670 ; Columns = []
671 ).
672
673row_column(I, Found, row(Columns, _Attrs), Cell) :-
674 ( nth1(I, Columns, Cell)
675 -> Found = true
676 ; Cell = element(td,[],[])
677 ).
678
679auto_column_width(State, Col, Width) :-
680 maplist(auto_cell_width(State), Col, Widths),
681 max_list(Widths, Width).
682
683auto_cell_width(State, Cell, Width) :-
684 cell_colspan(Cell, 1),
685 !,
686 format_cell_to_string(Cell, 1_000, State, String),
687 split_string(String, "\n", "", Lines),
688 maplist(string_length, Lines, LineW),
689 max_list(LineW, Width0),
690 Width is Width0 + State.margin_right.
691auto_cell_width(_, _, 0).
697format_row(ColWidths, State, MarginLeft, Row) :-
698 hrule(Row, ColWidths, MarginLeft),
699 format_cells(ColWidths, CWSpanned, 1, Row, State, Cells),
700 format_row_lines(1, CWSpanned, Cells, MarginLeft).
701
702hrule(row(_, Attrs), ColWidths, MarginLeft) :-
703 attrs_classes(Attrs, Classes),
704 memberchk(hline, Classes),
705 !,
706 sum_list(ColWidths, RuleLen),
707 format('~N~t~*|~`-t~*+', [MarginLeft, RuleLen]).
708hrule(_, _, _).
709
710format_row_lines(LineNo, Widths, Cells, MarginLeft) :-
711 nth_row_line(Widths, 1, LineNo, Cells, CellLines, Found),
712 ( Found == true
713 -> emit_nl,
714 emit_indent(MarginLeft),
715 maplist(emit_cell_line, CellLines),
716 ask_nl(1),
717 LineNo1 is LineNo + 1,
718 format_row_lines(LineNo1, Widths, Cells, MarginLeft)
719 ; true
720 ).
721
722emit_cell_line(Line-Pad) :-
723 write(Line),
724 forall(between(1,Pad,_), put_char(' ')).
725
726nth_row_line([], _, _, _, [], _).
727nth_row_line([ColW|CWT], CellNo, LineNo, Cells, [CellLine-Pad|ColLines],
728 Found) :-
729 nth1(CellNo, Cells, CellLines),
730 ( nth1(LineNo, CellLines, CellLine)
731 -> Found = true,
732 Pad = 0
733 ; CellLine = '', Pad = ColW
734 ),
735 CellNo1 is CellNo + 1,
736 nth_row_line(CWT, CellNo1, LineNo, Cells, ColLines, Found).
745format_cells([], [], _, _, _, []) :- !.
746format_cells(CWidths, [HW|TW], Column, Row, State, [HC|TC]) :-
747 Row = row(Columns, _Attrs),
748 nth1(Column, Columns, Cell),
749 cell_colspan(Cell, CWidths, HW, TW0),
750 cell_align(Cell, Align),
751 format_cell_to_string(Cell, HW, State.put(_{pad:' ', text_align:Align}), String),
752 split_string(String, "\n", "", HC),
753 Column1 is Column+1,
754 format_cells(TW0, TW, Column1, Row, State, TC).
755
756cell_colspan(Cell, CWidths, HW, TW) :-
757 cell_colspan(Cell, Span),
758 length(SpanW, Span),
759 append(SpanW, TW, CWidths),
760 sum_list(SpanW, HW).
761
762cell_colspan(element(_,Attrs,_), Span) :-
763 ( memberchk(colspan=SpanA, Attrs),
764 atom_number(SpanA, SpanN)
765 -> Span = SpanN
766 ; Span = 1
767 ).
775cell_align(element(_,Attrs,_), Align) :-
776 ( memberchk(align=AlignA, Attrs)
777 -> Align = AlignA
778 ; memberchk(style=Style, Attrs),
779 style_css_attrs(Style, Props),
780 memberchk('text-align'(AlignA), Props)
781 -> Align = AlignA
782 ; Align = left
783 ).
790format_cell_to_string(element(_,_,[]), ColWidth, State, String) :-
791 Pad = State.get(pad),
792 !,
793 length(Chars, ColWidth),
794 maplist(=(Pad), Chars),
795 atomics_to_string(Chars, String).
796format_cell_to_string(Cell, ColWidth, State, String) :-
797 setup_call_cleanup(
798 init_nl(NlState),
799 with_output_to(
800 string(String),
801 format_cell(Cell, ColWidth, State)),
802 exit_nl(NlState)).
803
804format_cell(element(E, _Attrs, Content), ColWidth, State) :-
805 set_stream(current_output, tty(State.tty)),
806 cell_element(E, Style),
807 update_style(Style, State.put(width, ColWidth), CellState),
808 block_words(Content, Blocks, Words, CellState),
809 emit_block(Words, [], CellState),
810 ( Blocks \== []
811 -> format_dom(Blocks, CellState)
812 ; true
813 ).
814
815cell_element(td, [normal]).
816cell_element(th, [bold]).
823emit_hr(_Attrs, BlockAttrs, State) :-
824 option(margin_left(ML), BlockAttrs, 0),
825 option(margin_right(MR), BlockAttrs, 0),
826 RuleWidth is State.width - ML - MR,
827 Style = State.style,
828 emit_indent(ML),
829 ( Style == []
830 -> format('~|~*t~*+', [0'-, RuleWidth])
831 ; ansi_format(Style, '~|~*t~*+', [0'-, RuleWidth])
832 )