    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2018, VU University Amsterdam
    7                              CWI, Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(csv,
   37          [ csv//1,                     % +Rows
   38            csv//2,                     % +Rows, +Options
   39
   40            csv_read_file/2,            % +File, -Data
   41            csv_read_file/3,            % +File, -Data, +Options
   42            csv_read_stream/3,          % +Stream, -Data, +Options
   43
   44            csv_read_file_row/3,        % +File, -Row, +Options
   45            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   46            csv_options/2,		% -Compiled, +Options
   47
   48            csv_write_file/2,           % +File, +Data
   49            csv_write_file/3,           % +File, +Data, +Options
   50            csv_write_stream/3          % +Stream, +Data, +Options
    51          ]).
    52:- use_module(library(record)).
    53:- use_module(library(error)).
    54:- use_module(library(pure_input)).
    55:- use_module(library(debug)).
    56:- use_module(library(option)).
    57:- use_module(library(apply)).
    58:- use_module(library(dcg/basics)).
    59
   60/** <module> Process CSV (Comma-Separated Values) data
   61
   62This library parses and generates CSV data.   CSV data is represented in
   63Prolog as a list of rows. Each row   is  a compound term, where all rows
   64have the same name and arity.
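
      For example (made-up data), a table with two columns could be
      represented by the list of row terms below:

      ==
      [ row('Name', 'Age'),
        row(alice, 4),
        row(bob, 5)
      ]
      ==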
   65
   66@tbd    Implement immediate assert of the data to avoid possible stack
   67        overflows.
   68@tbd    Writing creates an intermediate code-list, possibly overflowing
   69        resources.  This waits for pure output!
   70@see RFC 4180
   71*/
   72
   73:- predicate_options(csv//2, 2,
    74                     [ separator(nonneg),       % must be code
   75                       strip(boolean),
   76                       ignore_quotes(boolean),
   77                       convert(boolean),
   78                       case(oneof([down,preserve,up])),
   79                       functor(atom),
   80                       arity(-nonneg),          % actually ?nonneg
   81                       match_arity(boolean)
    82                     ]).
    83:- predicate_options(csv_read_file/3, 3,
   84                     [ pass_to(csv//2, 2),
   85                       pass_to(phrase_from_file/3, 3)
    86                     ]).
    87:- predicate_options(csv_read_file_row/3, 3,
   88                     [ pass_to(csv//2, 2),
   89                       pass_to(open/4, 4)
    90                     ]).
    91:- predicate_options(csv_write_file/3, 3,
   92                     [ pass_to(csv//2, 2),
   93                       pass_to(open/4, 4)
    94                     ]).
    95:- predicate_options(csv_write_stream/3, 3,
   96                     [ pass_to(csv//2, 2)
    97                     ]).
    98
   99
  100:- record
  101    csv_options(separator:integer=0',,
  102                strip:boolean=false,
  103                ignore_quotes:boolean=false,
  104                convert:boolean=true,
  105                case:oneof([down,preserve,up])=preserve,
  106                functor:atom=row,
  107                arity:integer,
  108                match_arity:boolean=true,
   109                skip_header:atom).
   110
  111
  112%!  csv_read_file(+File, -Rows) is det.
  113%!  csv_read_file(+File, -Rows, +Options) is det.
  114%
  115%   Read a CSV file into a list of   rows. Each row is a Prolog term
  116%   with the same arity. Options  is   handed  to  csv//2. Remaining
  117%   options  are  processed  by    phrase_from_file/3.  The  default
  118%   separator depends on the file name   extension and is =|\t|= for
  119%   =|.tsv|= files and =|,|= otherwise.
  120%
  121%   Suppose we want to create a predicate   table/6  from a CSV file
  122%   that we know contains 6 fields  per   record.  This  can be done
  123%   using the code below. Without the   option  arity(6), this would
  124%   generate a predicate table/N, where N   is  the number of fields
  125%   per record in the data.
  126%
  127%       ==
  128%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
  129%          maplist(assert, Rows).
  130%       ==
  131
  132
  133csv_read_file(File, Rows) :-
  134    csv_read_file(File, Rows, []).
  135
  136csv_read_file(File, Rows, Options) :-
  137    default_separator(File, Options, Options1),
  138    make_csv_options(Options1, Record, RestOptions),
  139    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  140
  141
  142default_separator(File, Options0, Options) :-
  143    (   option(separator(_), Options0)
  144    ->  Options = Options0
  145    ;   file_name_extension(_, Ext0, File),
  146        downcase_atom(Ext0, Ext),
  147        ext_separator(Ext, Sep)
  148    ->  Options = [separator(Sep)|Options0]
  149    ;   Options = Options0
  150    ).
  151
  152ext_separator(csv, 0',).
  153ext_separator(tsv, 0'\t).
  154
  155
  156%!  csv_read_stream(+Stream, -Rows, +Options) is det.
  157%
  158%   Read CSV data from Stream.  See also csv_read_row/3.
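      %
      %   For example, a minimal sketch that reads a whole file through an
      %   explicitly opened stream (the file name is illustrative only):
      %
      %       ==
      %       ?- setup_call_cleanup(
      %              open('data.csv', read, In),
      %              csv_read_stream(In, Rows, []),
      %              close(In)).
      %       ==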
  159
  160csv_read_stream(Stream, Rows, Options) :-
  161    make_csv_options(Options, Record, _),
  162    phrase_from_stream(csv_roptions(Rows, Record), Stream).
  163
  164
  165%!  csv(?Rows)// is det.
  166%!  csv(?Rows, +Options)// is det.
  167%
  168%   Prolog DCG to `read/write' CSV data.  Options:
  169%
  170%       * separator(+Code)
  171%       The comma-separator.  Must be a character code.  Default is
  172%       (of course) the comma. Character codes can be specified
   173%       using the 0' notation. E.g., using =|separator(0';)|= parses
  174%       a semicolon separated file.
  175%
  176%       * ignore_quotes(+Boolean)
   177%       If =true= (default =false=), treat double quotes as a normal
  178%       character.
  179%
  180%       * strip(+Boolean)
  181%       If =true= (default =false=), strip leading and trailing
  182%       blank space.  RFC4180 says that blank space is part of the
  183%       data.
  184%
  185%       * skip_header(+CommentLead)
  186%       Skip leading lines that start with CommentLead.  There is
  187%       no standard for comments in CSV files, but some CSV files
  188%       have a header where each line starts with `#`.  After
  189%       skipping comment lines this option causes csv//2 to skip empty
  190%       lines.  Note that an empty line may not contain white space
  191%       characters (space or tab) as these may provide valid data.
  192%
  193%       * convert(+Boolean)
  194%       If =true= (default), use name/2 on the field data.  This
  195%       translates the field into a number if possible.
  196%
  197%       * case(+Action)
  198%       If =down=, downcase atomic values.  If =up=, upcase them
  199%       and if =preserve= (default), do not change the case.
  200%
  201%       * functor(+Atom)
  202%       Functor to use for creating row terms.  Default is =row=.
  203%
  204%       * arity(?Arity)
  205%       Number of fields in each row.  This predicate raises
  206%       a domain_error(row_arity(Expected), Found) if a row is
  207%       found with different arity.
  208%
  209%       * match_arity(+Boolean)
  210%       If =false= (default =true=), do not reject CSV files where
  211%       lines provide a varying number of fields (columns).  This
   212%       can be a work-around for processing incorrect CSV files.
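      %
      %   As a minimal illustration (the data below is made up), parsing a
      %   semicolon separated code list with this DCG could look like:
      %
      %       ==
      %       ?- phrase(csv(Rows, [separator(0';)]), `a;1\nb;2\n`).
      %       Rows = [row(a, 1), row(b, 2)].
      %       ==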
  213
  214csv(Rows) -->
  215    csv(Rows, []).
  216
  217csv(Rows, Options) -->
  218    { make_csv_options(Options, Record, _) },
  219    csv_roptions(Rows, Record).
  220
  221csv_roptions(Rows, Record) -->
  222    { ground(Rows) },
  223    !,
  224    emit_csv(Rows, Record).
  225csv_roptions(Rows, Record) -->
  226    skip_header(Record),
  227    csv_data(Rows, Record).
  228
  229skip_header(Options) -->
  230    { csv_options_skip_header(Options, CommentStart),
  231      nonvar(CommentStart),
  232      atom_codes(CommentStart, Codes)
  233    },
  234    !,
  235    skip_header_lines(Codes),
  236    skip_blank_lines.
  237skip_header(_) -->
  238    [].
  239
  240skip_header_lines(CommentStart) -->
  241    string(CommentStart),
  242    !,
  243    (   string(_Comment),
  244        end_of_record
  245    ->  skip_header_lines(CommentStart)
  246    ).
  247skip_header_lines(_) -->
  248    [].
  249
  250skip_blank_lines -->
  251    eos,
  252    !.
  253skip_blank_lines -->
  254    end_of_record,
  255    !,
  256    skip_blank_lines.
  257skip_blank_lines -->
  258    [].
  259
  260csv_data([], _) -->
  261    eos,
  262    !.
  263csv_data([Row|More], Options) -->
  264    row(Row, Options),
  265    !,
  266    { debug(csv, 'Row: ~p', [Row]) },
  267    csv_data(More, Options).
  268
  269
  270row(Row, Options) -->
  271    fields(Fields, Options),
  272    { csv_options_functor(Options, Functor),
  273      Row =.. [Functor|Fields],
  274      functor(Row, _, Arity),
  275      check_arity(Options, Arity)
  276    }.
  277
  278check_arity(Options, Arity) :-
  279    csv_options_arity(Options, Arity),
  280    !.
  281check_arity(Options, _) :-
  282    csv_options_match_arity(Options, false),
  283    !.
  284check_arity(Options, Arity) :-
  285    csv_options_arity(Options, Expected),
  286    domain_error(row_arity(Expected), Arity).
  287
  288fields([F|T], Options) -->
  289    field(F, Options),
  290    (   separator(Options)
  291    ->  fields(T, Options)
  292    ;   end_of_record
  293    ->  { T = [] }
  294    ).
  295
  296field(Value, Options) -->
  297    "\"",
  298    { csv_options_ignore_quotes(Options, false) },
  299    !,
  300    string_codes(Codes),
  301    { make_value(Codes, Value, Options) }.
  302field(Value, Options) -->
  303    { csv_options_strip(Options, true) },
  304    !,
  305    stripped_field(Value, Options).
  306field(Value, Options) -->
  307    { csv_options_separator(Options, Sep) },
  308    field_codes(Codes, Sep),
  309    { make_value(Codes, Value, Options) }.
  310
  311
  312stripped_field(Value, Options) -->
  313    ws,
  314    (   "\"",
  315        { csv_options_strip(Options, false) }
  316    ->  string_codes(Codes),
  317        ws
  318    ;   { csv_options_separator(Options, Sep) },
  319        field_codes(Codes0, Sep),
  320        { strip_trailing_ws(Codes0, Codes) }
  321    ),
  322    { make_value(Codes, Value, Options) }.
  323
  324ws --> " ", !, ws.
  325ws --> "\t", !, ws.
  326ws --> "".
  327
  328strip_trailing_ws(List, Stripped) :-
  329    append(Stripped, WS, List),
  330    all_ws(WS).
  331
  332all_ws([]).
  333all_ws([32|T]) :- all_ws(T).
  334all_ws([9|T]) :- all_ws(T).
  335
  336
  337%!  string_codes(-Codes)
  338%
  339%   Process a double-quotes string where  the   quote  is escaped by
  340%   doubling it. Eats the terminating double-quote.
  341
  342string_codes(List) -->
  343    [H],
  344    (   { H == 0'" }
  345    ->  (   "\""
  346        ->  { List = [H|T] },
  347            string_codes(T)
  348        ;   { List = [] }
  349        )
  350    ;   { List = [H|T] },
  351        string_codes(T)
  352    ).
  353
  354field_codes([], Sep), [Sep] --> [Sep], !.
  355field_codes([], _), "\n" --> "\r\n", !.
  356field_codes([], _), "\n" --> "\n", !.
  357field_codes([], _), "\n" --> "\r", !.
  358field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  359field_codes([], _) --> [].              % unterminated last record
  360
  361%!  make_value(+Codes, -Value, +Options) is det.
  362%
  363%   Convert a list of character codes to the actual value, depending
  364%   on Options.
  365
  366make_value(Codes, Value, Options) :-
  367    csv_options_convert(Options, Convert),
  368    csv_options_case(Options, Case),
  369    make_value(Convert, Case, Codes, Value).
  370
  371make_value(true, preserve, Codes, Value) :-
  372    !,
  373    name(Value, Codes).
  374make_value(true, Case, Codes, Value) :-
  375    !,
  376    (   number_string(Value, Codes)
  377    ->  true
  378    ;   make_value(false, Case, Codes, Value)
  379    ).
  380make_value(false, preserve, Codes, Value) :-
  381    !,
  382    atom_codes(Value, Codes).
  383make_value(false, down, Codes, Value) :-
  384    !,
  385    string_codes(String, Codes),
  386    downcase_atom(String, Value).
  387make_value(false, up, Codes, Value) :-
  388    string_codes(String, Codes),
  389    upcase_atom(String, Value).
  390
  391separator(Options) -->
  392    { csv_options_separator(Options, Sep) },
  393    [Sep].
  394
  395end_of_record --> "\n".			% Unix files
  396end_of_record --> "\r\n".               % DOS files
  397end_of_record --> "\r".                 % MacOS files
  398end_of_record --> eos.                  % unterminated last record
  399
  400
  401%!  csv_read_file_row(+File, -Row, +Options) is nondet.
  402%
  403%   True when Row is a row in File.  First unifies Row with the first
  404%   row in File. Backtracking  yields  the   second,  ...  row.  This
  405%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
  406%   loading all rows in memory.  Note   that  this interface does not
  407%   guarantee that all rows in File have the same arity.
  408%
  409%   In addition to the  options   of  csv_read_file/3, this predicate
  410%   processes the option:
  411%
  412%     * line(-Line)
  413%     Line is unified with the 1-based line-number from which Row is
  414%     read.  Note that Line is not the physical line, but rather the
  415%     _logical_ record number.
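      %
      %   For example, a hypothetical way to count the records in a file
      %   without loading them all into memory ('data.csv' is illustrative):
      %
      %       ==
      %       ?- aggregate_all(count, csv_read_file_row('data.csv', _, []), N).
      %       ==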
  416%
  417%   @tbd    Input is read line by line.  If a record separator is
  418%           embedded in a quoted field, parsing the record fails and
  419%           another line is added to the input.  This does not nicely
  420%           deal with other reasons why parsing the row may fail.
  421
  422csv_read_file_row(File, Row, Options) :-
  423    default_separator(File, Options, Options1),
  424    make_csv_options(Options1, RecordOptions, Options2),
  425    select_option(line(Line), Options2, RestOptions, _),
  426    setup_call_cleanup(
  427        open(File, read, Stream, RestOptions),
  428        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  429        close(Stream)).
  430
  431csv_read_stream_row(Stream, Row, Line, Options) :-
  432    between(1, infinite, Line),
  433    (   csv_read_row(Stream, Row0, Options),
  434        Row0 \== end_of_file
  435    ->  Row = Row0
  436    ;   !,
  437        fail
  438    ).
  439
  440
  441%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
  442%
  443%   Read the next CSV record from Stream  and unify the result with Row.
  444%   CompiledOptions is created from  options   defined  for csv//2 using
  445%   csv_options/2. Row is unified with   `end_of_file` upon reaching the
  446%   end of the input.
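      %
      %   A minimal sketch of an explicit read loop built on csv_read_row/3
      %   (process_row/1 stands for whatever handling the caller needs):
      %
      %       ==
      %       read_rows(Stream) :-
      %           csv_options(Options, []),
      %           repeat,
      %           csv_read_row(Stream, Row, Options),
      %           (   Row == end_of_file
      %           ->  !
      %           ;   process_row(Row),
      %               fail
      %           ).
      %       ==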
  447
  448csv_read_row(Stream, Row, _Record) :-
  449    at_end_of_stream(Stream),
  450    !,
  451    Row = end_of_file.
  452csv_read_row(Stream, Row, Record) :-
  453    read_lines_to_codes(Stream, Codes, Record, even),
  454    phrase(row(Row0, Record), Codes),
  455    !,
  456    Row = Row0.
  457
  458read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  459    read_line_to_codes(Stream, Codes0),
  460    Codes0 \== end_of_file,
  461    (   (   csv_options_ignore_quotes(Options, true)
  462        ;   check_quotes(Codes0, QuoteQuantity, even)
  463        )
  464    ->  Codes = Codes0
  465    ;   append(Codes0, [0'\n|Tail], Codes),
  466        read_lines_to_codes(Stream, Tail, Options, odd)
  467    ).
  468
  469check_quotes([], QuoteQuantity, QuoteQuantity) :-
  470    !.
  471check_quotes([0'"|T], odd, Result) :-
  472    !,
  473    check_quotes(T, even, Result).
  474check_quotes([0'"|T], even, Result) :-
  475    !,
  476    check_quotes(T, odd, Result).
  477check_quotes([_|T], QuoteQuantity, Result) :-
  478    check_quotes(T, QuoteQuantity, Result).
  479
  480
  481%!  csv_options(-Compiled, +Options) is det.
  482%
  483%   Compiled is the  compiled  representation   of  the  CSV  processing
  484%   options as they may be passed into   csv//2,  etc. This predicate is
  485%   used in combination with csv_read_row/3 to avoid repeated processing
  486%   of the options.
  487
  488csv_options(Compiled, Options) :-
  489    make_csv_options(Options, Compiled, _Ignored).
  490
  491
  492                /*******************************
  493                *             OUTPUT           *
  494                *******************************/
  495
  496%!  csv_write_file(+File, +Data) is det.
  497%!  csv_write_file(+File, +Data, +Options) is det.
  498%
  499%   Write a list of Prolog terms to a CSV file.  Options are given
  500%   to csv//2.  Remaining options are given to open/4.  The  default
  501%   separator depends on the file name   extension and is =|\t|= for
  502%   =|.tsv|= files and =|,|= otherwise.
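      %
      %   For example, writing two made-up rows (the file name is only
      %   illustrative):
      %
      %       ==
      %       ?- csv_write_file('out.csv', [row(name, age), row(alice, 4)]).
      %       ==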
  503
  504csv_write_file(File, Data) :-
  505    csv_write_file(File, Data, []).
  506
  507csv_write_file(File, Data, Options) :-
  508    must_be(list, Data),
  509    default_separator(File, Options, Options1),
  510    make_csv_options(Options1, OptionsRecord, RestOptions),
  511    setup_call_cleanup(
  512        open(File, write, Out, RestOptions),
  513        maplist(csv_write_row(Out, OptionsRecord), Data),
  514        close(Out)).
  515
  516csv_write_row(Out, OptionsRecord, Row) :-
  517    phrase(emit_row(Row, OptionsRecord), String),
  518    format(Out, '~s', [String]).
  519
  520emit_csv([], _) --> [].
  521emit_csv([H|T], Options) -->
  522    emit_row(H, Options),
  523    emit_csv(T, Options).
  524
  525emit_row(Row, Options) -->
  526    { Row =.. [_|Fields] },
  527    emit_fields(Fields, Options),
  528    "\r\n".                                     % RFC 4180 demands \r\n
  529
  530emit_fields([], _) -->
  531    "".
  532emit_fields([H|T], Options) -->
  533    emit_field(H, Options),
   534    (   { T == [] }
   535    ->  []
   536    ;   { csv_options_separator(Options, Sep) },
   537        [Sep],
   538        emit_fields(T, Options)
   539    ).
  540
  541emit_field(H, Options) -->
  542    { (   atom(H)
  543      ->  atom_codes(H, Codes)
  544      ;   string(H)
  545      ->  string_codes(H, Codes)
  546      )
  547    },
  548    !,
  549    (   { needs_quotes(H, Options) }
  550    ->  "\"", emit_string(Codes), "\""
  551    ;   emit_codes(Codes)
  552    ).
  553emit_field([], _) -->
  554    !,
  555    { atom_codes('[]', Codes) },
  556    emit_codes(Codes).
  557emit_field(H, _) -->
  558    { number_codes(H,Codes) },
  559    emit_codes(Codes).
  560
  561needs_quotes(Atom, _) :-
  562    sub_atom(Atom, _, _, _, '"'),
  563    !.
  564needs_quotes(Atom, _) :-
  565    sub_atom(Atom, _, _, _, '\n'),
  566    !.
  567needs_quotes(Atom, _) :-
  568    sub_atom(Atom, _, _, _, '\r'),
  569    !.
  570needs_quotes(Atom, Options) :-
  571    csv_options_separator(Options, Sep),
  572    char_code(Char, Sep),
  573    sub_atom(Atom, _, _, _, Char),
  574    !.
  575
  576emit_string([]) --> "".
  577emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  578emit_string([H|T]) --> [H], emit_string(T).
  579
  580emit_codes([]) --> "".
  581emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  582emit_codes([H|T]) --> [H], emit_codes(T).
  583
  584
  585%%     csv_write_stream(+Stream, +Data, +Options) is det.
  586%
  587%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
  588%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
  589%      incrementally. The example  below  saves   all  answers  from the
  590%      predicate data/3 to File.
  591%
  592%        ==
  593%        save_data(File) :-
  594%           setup_call_cleanup(
  595%               open(File, write, Out),
  596%               forall(data(C1,C2,C3),
  597%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
   598%               close(Out)).
  599%        ==
  600
  601csv_write_stream(Stream, Data, Options) :-
  602    must_be(list, Data),
  603    make_csv_options(Options, OptionsRecord, _),
   604    maplist(csv_write_row(Stream, OptionsRecord), Data).