summaryrefslogtreecommitdiffstats
path: root/src/lib/util/versioned_csv_file.h
blob: 4967d346e8e97e55bd9b07e83cafdd4846027cb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
// Copyright (C) 2015 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef VERSIONED_CSV_FILE_H
#define VERSIONED_CSV_FILE_H

#include <util/csv_file.h>

namespace isc {
namespace util {

/// @brief Exception thrown when an error occurs during CSV file processing.
///
/// All constructor arguments are handed unchanged to the @c isc::Exception
/// base class.
class VersionedCSVFileError : public Exception {
public:
    /// @brief Constructor
    ///
    /// @param file Forwarded to @c isc::Exception (presumably the name of
    /// the source file raising the error - confirm against the base class).
    /// @param line Forwarded to @c isc::Exception (presumably the line
    /// number within @c file - confirm against the base class).
    /// @param what Forwarded to @c isc::Exception (presumably the text
    /// describing the error - confirm against the base class).
    VersionedCSVFileError(const char* file, size_t line, const char* what)
        : isc::Exception(file, line, what) {
    }
};

/// @brief Descriptor holding the metadata for one column in a file.
///
/// Groups together the column's name, the label of the schema version in
/// which the column first appeared, and the value to use for the column
/// when a data row does not contain it.
class VersionedColumn {
public:
    /// @brief Constructor
    ///
    /// @param name Name of the column.
    /// @param version Text representation of the schema version in which
    /// this column first appeared.
    /// @param default_value Value the column should be given when it is
    /// absent from a data row.  An empty string is used unless specified.
    VersionedColumn(const std::string& name, const std::string& version,
                    const std::string& default_value = "")
        : name_(name),
          version_(version),
          default_value_(default_value) {
    }

    /// @brief Destructor
    virtual ~VersionedColumn() {
    }

    /// @brief Name of the column.
    std::string name_;

    /// @brief Text representation of the schema version in which
    /// this column first appeared.
    std::string version_;

    /// @brief Value the column should be assigned if it is not present
    /// in a data row.
    std::string default_value_;
};

/// @brief Defines a shared-ownership smart pointer to a VersionedColumn
typedef boost::shared_ptr<VersionedColumn> VersionedColumnPtr;

/// @brief Implements a CSV file that supports multiple versions of
/// the file's "schema".  This allows files with older schemas to be
/// upgraded to newer schemas as they are being read.  The file's schema
/// is defined through a list of column descriptors, or @ref
/// isc::util::VersionedColumn(s). Each descriptor contains metadata describing
/// the column, consisting of the column's name, the version label in which
/// the column was added to the schema, and a default value to be used if the
/// column is missing from the file.  Note that the column descriptors are
/// defined in the order they occur in the file, when reading a row from left
/// to right.  This also assumes that when a new version of the schema evolves,
/// all new columns are added at the end of the row.  In other words, the
/// order of the columns reflects not only the order in which they occur
/// in a row but also the order they were added to the schema.  Conceptually,
/// the entire list of columns defined constitutes the current schema.  Earlier
/// schema versions are therefore subsets of this list.   Creating the schema
/// is done by calling VersionedCSVFile::addColumn() for each column.  Note
/// that the schema must be defined prior to opening the file.
///
/// The first row of the file is always the header row and is a comma-separated
/// list of the names of the columns in the file.  This row is used when
/// opening the file via @ref VersionedCSVFile::open(), to identify its schema
/// version so that it may be read correctly.  This is done by comparing
/// the columns found in the header to the columns defined in the schema. The
/// columns must match both by name and the order in which they occur.
///
/// -# If there are fewer columns in the header than in the schema, the file
/// is presumed to be an earlier schema version and will be upgraded as it is
/// read.  There is an ability to mark a specific column as being the minimum
/// column which must be present, see
/// @ref VersionedCSVFile::setMinimumValidColumns().
/// If the header columns do not match up to this
/// minimum column, the file is presumed to be too old to upgrade and the
/// open will fail.  A valid, upgradable file will have an input schema
/// state of VersionedCSVFile::NEEDS_UPGRADE.
///
/// -# If there is a mismatch between a found column name and the column name
/// defined for that position in the row, the file is presumed to be invalid
/// and the open will fail.
///
/// -# If the content of the header matches exactly the columns defined in
/// the schema, the file is considered to match the schema exactly and the
/// input schema state will be VersionedCSVFile::CURRENT.
///
/// -# If there are columns in the header beyond all of the columns defined in
/// the schema (i.e. the schema is a subset of the header), then the file
/// is presumed to be from a newer version of Kea and can be downgraded. The
/// input schema state of the file will be set to
/// VersionedCSVFile::NEEDS_DOWNGRADE.
///
/// After successfully opening a file, rows are read one at a time via
/// @ref VersionedCSVFile::next() and handled according to the input schema
/// state.   Each data row is expected to have at least the same number of
/// columns as were found in the header. Any row which has fewer values is
/// discarded as invalid.  Similarly, any row which is found to have more
/// values than were found in the header is discarded as invalid.
///
/// When upgrading a row, the value for each missing column is filled in
/// with the default value specified by that column's descriptor.  When
/// downgrading a row, extraneous values are dropped from the row.
///
/// It is important to note that upgrading or downgrading a file does NOT
/// alter the physical file itself.  Rather the conversion occurs after the
/// raw data has been read but before it is passed to the caller.
///
/// Also note that there is currently no support for writing out a file in
/// anything other than the current schema.
class VersionedCSVFile : public CSVFile {
public:

    /// @brief Possible input file schema states.
    /// Used to categorize the input file's schema, relative to the defined
    /// schema.
    enum InputSchemaState {
        /// Input file schema matches the defined schema.
        CURRENT,
        /// Input file schema is older than the defined schema.
        NEEDS_UPGRADE,
        /// Input file schema is newer than the defined schema.
        NEEDS_DOWNGRADE
    };

    /// @brief Constructor.
    ///
    /// @param filename CSV file name.
    VersionedCSVFile(const std::string& filename);

    /// @brief Destructor
    virtual ~VersionedCSVFile();

    /// @brief Adds metadata for a single column to the schema.
    ///
    /// This method appends a new column description to the file's schema.
    /// Note this does not cause anything to be written to the physical file.
    /// The name of the column will be placed in the CSV header when a new
    /// file is created by calling the @c recreate or @c open function.
    ///
    /// @param col_name Name of the column.
    /// @param version  Text representation of the schema version in which
    /// this column first appeared.
    /// @param default_value Value the missing column should be given during
    /// an upgrade.  It defaults to an empty string, "".
    ///
    /// @throw CSVFileError if a column with the specified name exists.
    void addColumn(const std::string& col_name, const std::string& version,
                   const std::string& default_value = "");

    /// @brief Sets the minimum number of valid columns based on a given column
    ///
    /// @param column_name Name of the column which positionally represents
    /// the minimum columns which must be present in a file for it to be
    /// considered valid.
    void setMinimumValidColumns(const std::string& column_name);

    /// @brief Returns the minimum number of columns which must be present
    /// for the file to be considered valid.
    size_t getMinimumValidColumns() const;

    /// @brief Returns the number of columns found in the input header
    size_t getInputHeaderCount() const;

    /// @brief Returns the number of valid columns found in the header
    /// For newly created files this will always match the number of defined
    /// columns (i.e. getColumnCount()).  For existing files, this will be
    /// the number of columns in the header that match the defined columns.
    /// When this number is less than getColumnCount() it means the input file
    /// is from an earlier schema.  This value is zero until the file has
    /// been opened.
    size_t getValidColumnCount() const;

    /// @brief Opens existing file or creates a new one.
    ///
    /// This function will try to open an existing file if this file has size
    /// greater than 0. If the file doesn't exist or has size of 0, the
    /// file is recreated. If the existing file has been opened, the header
    /// is parsed and validated against the schema.
    /// By default, the data pointer in the file is set to the beginning of
    /// the first data row. In order to retrieve the row contents the @c next
    /// function should be called. If a @c seek_to_end parameter is set to
    /// true, the file will be opened and the internal pointer will be set
    /// to the end of file.
    ///
    /// @param seek_to_end A boolean value which indicates if the input and
    /// output file pointer should be set at the end of file.
    ///
    /// @throw VersionedCSVFileError if schema has not been defined,
    /// CSVFileError when IO operation fails, or header fails to validate.
    virtual void open(const bool seek_to_end = false);

    /// @brief Creates a new CSV file.
    ///
    /// The file creation will fail if there are no columns specified.
    /// Otherwise, this function will write the header to the file.
    /// In order to write rows to the opened file, the @c append function
    /// should be called.
    ///
    /// @throw VersionedCSVFileError if schema has not been defined,
    /// CSVFileError if an IO operation fails
    virtual void recreate();

    /// @brief Reads next row from the file.
    ///
    /// This function will return the @c CSVRow object representing a
    /// parsed row if parsing is successful. If the end of file has been
    /// reached, the empty row is returned (a row containing no values).
    ///
    /// 1. If the row has fewer values than were found in the header it is
    /// discarded as invalid.
    ///
    /// 2. If the row is found to have more values than are defined in the
    /// schema it is discarded as invalid.
    ///
    /// When a valid row has fewer than the defined number of columns, the
    /// value for each missing column is filled in with the default value
    /// specified by that column's descriptor.
    ///
    /// @param [out] row Object receiving the parsed CSV row.
    ///
    /// @return true if row has been read and validated; false if validation
    /// failed.
    bool next(CSVRow& row);

    /// @brief Returns the schema version of the physical file
    ///
    /// @return text version of the schema found or string "undefined" if the
    /// file has not been opened
    std::string getInputSchemaVersion() const;

    /// @brief Text version of current schema supported by the file's metadata
    ///
    /// @return text version info assigned to the last column in the list of
    /// defined columns, or the string "undefined" if no columns have been
    /// defined.
    std::string getSchemaVersion() const;

    /// @brief Fetch the column descriptor for a given index
    ///
    /// @param index index within the list of columns of the desired column
    /// @return a pointer to the VersionedColumn at the given index
    /// @throw OutOfRange exception if the index is invalid
    const VersionedColumnPtr& getVersionedColumn(const size_t index) const;

    /// @brief Fetches the state of the input file's schema
    ///
    /// Reflects the state of the input file's schema relative to the
    /// defined schema as an enum, InputSchemaState.
    ///
    /// @return VersionedCSVFile::CURRENT if the input file schema matches
    /// the defined schema, NEEDS_UPGRADE if the input file schema is older,
    /// and NEEDS_DOWNGRADE if it is newer
    enum InputSchemaState getInputSchemaState() const;

    /// @brief Returns true if the input file schema state is not CURRENT
    bool needsConversion() const;

protected:

    /// @brief Validates the header of a VersionedCSVFile
    ///
    /// This function is called internally when reading in an existing
    /// file.  It parses the header row of the file, comparing each value
    /// in succession against the defined list of columns.  If the header
    /// contains too few matching columns (i.e. less than @c
    /// minimum_valid_columns_) or too many (more than the number of defined
    /// columns), the file is presumed to be either too old, too new, or too
    /// corrupt to process.  Otherwise it retains the number of valid columns
    /// found and deems the header valid.
    ///
    /// NOTE(review): the class-level description states that a header with
    /// columns beyond those defined in the schema is accepted with state
    /// NEEDS_DOWNGRADE, which appears to conflict with the "too many"
    /// rejection described here - confirm against the implementation.
    ///
    /// @param header A row holding a header.
    /// @return true if header matches the columns; false otherwise.
    virtual bool validateHeader(const CSVRow& header);

    /// @brief Convenience method for adding an error message
    ///
    /// Constructs an error message indicating that the number of columns
    /// in a given row is wrong and why, then adds it to readMsg.
    ///
    /// @param row The row in error
    /// @param reason An explanation as to why the row column count is wrong
    void columnCountError(const CSVRow& row, const std::string& reason);

private:
    /// @brief Holds the collection of column descriptors
    std::vector<VersionedColumnPtr> columns_;

    /// @brief Number of valid columns present in input file. If this is less
    /// than the number of columns defined, this implies the input file is
    /// from an earlier version of the code.
    size_t valid_column_count_;

    /// @brief Minimum number of valid columns an input file must contain.
    /// If an input file does not meet this number it cannot be upgraded.
    size_t minimum_valid_columns_;

    /// @brief The number of columns found in the input header row
    /// This value represents the number of columns present in the header,
    /// valid or otherwise.
    size_t input_header_count_;

    /// @brief The state of the input schema in relation to the current schema
    enum InputSchemaState input_schema_state_;
};


} // namespace isc::util
} // namespace isc

#endif // VERSIONED_CSV_FILE_H