summaryrefslogtreecommitdiffstats
path: root/src/lib/dns/master_lexer.h
blob: a1db405f31d92a0130a68fc1de8ec42307295d2a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
// Copyright (C) 2012  Internet Systems Consortium, Inc. ("ISC")
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.

#ifndef MASTER_LEXER_H
#define MASTER_LEXER_H 1

#include <exceptions/exceptions.h>

#include <istream>
#include <string>

#include <stdint.h>

namespace isc {
namespace dns {
namespace master_lexer_internal {
class State;
}

/// \brief Tokenizer for parsing DNS master files.
///
/// The \c MasterLexer class provides tokenize interfaces for parsing DNS
/// master files.  It understands some special rules of master files as
/// defined in RFC 1035, such as comments, character escaping, and multi-line
/// data, and provides the user application with the actual data in a
/// more convenient form such as a std::string object.
///
/// In order to support the $INCLUDE notation, this class is designed to be
/// able to operate on multiple files or input streams in the nested way.
/// The \c pushSource() and \c popSource() methods correspond to the push
/// and pop operations.
///
/// While this class is public, it is less likely to be used by normal
/// applications; it's mainly expected to be used within this library,
/// specifically by the \c MasterLoader class and \c Rdata implementation
/// classes.
///
/// \note The error handling policy of this class is slightly different from
/// that of other classes of this library.  We generally throw an exception
/// for an invalid input, whether it's more likely to be a program error or
/// a "user error", which means an invalid input that comes from outside of
/// the library.  But, this class returns an error code for some certain
/// types of user errors instead of throwing an exception.  Such cases include
/// a syntax error identified by the lexer or a misspelled file name that
/// causes a system error at the time of open.  This is based on the assumption
/// that the main user of this class is a parser of master files, where
/// we want to give an option to ignore some non fatal errors and continue
/// the parsing.  This will be useful if it just performs overall error
/// checks on a master file.  When the (immediate) caller needs to do explicit
/// error handling, exceptions are not that a useful tool for error reporting
/// because we cannot separate the normal and error cases anyway, which would
/// be one major advantage when we use exceptions.  And, exceptions are
/// generally more expensive, either when it happens or just by being able
/// to handle with \c try and \c catch (depending on the underlying
/// implementation of the exception handling).  For these reasons, some of
/// this class does not throw for an error that would be reported as an
/// exception in other classes.
class MasterLexer {
    friend class master_lexer_internal::State;
public:
    /// \brief Exception thrown when we fail to read from the input
    /// stream or file.
    struct ReadError : public Unexpected {
        ReadError(const char* file, size_t line, const char* what) :
            Unexpected(file, line, what)
        {}
    };

    class Token;       // we define it separately for better readability

    /// \brief Options for getNextToken.
    ///
    /// A compound option, indicating multiple options are set, can be
    /// specified using the logical OR operator (operator|()).
    enum Options {
        NONE = 0,               ///< No option
        INITIAL_WS = 1, ///< recognize begin-of-line spaces after an
                        ///< end-of-line
        QSTRING = 2,    ///< recognize quoted string
        NUMBER = 4   ///< recognize numeric text as integer
    };

    /// \brief The constructor.
    ///
    /// \throw std::bad_alloc Internal resource allocation fails (rare case).
    MasterLexer();

    /// \brief The destructor.
    ///
    /// It internally closes any remaining input sources.
    ~MasterLexer();

    /// \brief Open a file and make it the current input source of MasterLexer.
    ///
    /// The opened file can be explicitly closed by the \c popSource() method;
    /// if \c popSource() is not called within the lifetime of the
    /// \c MasterLexer, it will be closed in the destructor.
    ///
    /// In the case possible system errors in opening the file (most likely
    /// because of specifying a non-existent or unreadable file), it returns
    /// false, and if the optional \c error parameter is non NULL, it will be
    /// set to a description of the error (any existing content of the string
    /// will be discarded).  If opening the file succeeds, the given
    /// \c error parameter will be intact.
    ///
    /// Note that this method has two styles of error reporting: one by
    /// returning \c false (and setting \c error optionally) and the other
    /// by throwing an exception.  See the note for the class description
    /// about the distinction.
    ///
    /// \throw InvalidParameter filename is NULL
    /// \param filename A non NULL string specifying a master file
    /// \param error If non null, a placeholder to set error description in
    /// case of failure.
    ///
    /// \return true if pushing the file succeeds; false otherwise.
    bool pushSource(const char* filename, std::string* error = NULL);

    /// \brief Make the given stream the current input source of MasterLexer.
    ///
    /// The caller still holds the ownership of the passed stream; it's the
    /// caller's responsibility to keep it valid as long as it's used in
    /// \c MasterLexer or to release any resource for the stream after that.
    /// The caller can explicitly tell \c MasterLexer to stop using the
    /// stream by calling the \c popSource() method.
    ///
    /// \param input An input stream object that produces textual
    /// representation of DNS RRs.
    void pushSource(std::istream& input);

    /// \brief Stop using the most recently opened input source (file or
    /// stream).
    ///
    /// If it's a file, the previously opened file will be closed internally.
    /// If it's a stream, \c MasterLexer will simply stop using
    /// the stream; the caller can assume it will be never used in
    /// \c MasterLexer thereafter.
    ///
    /// This method must not be called when there is no source pushed for
    /// \c MasterLexer.  This method is otherwise exception free.
    ///
    /// \throw isc::InvalidOperation Called with no pushed source.
    void popSource();

    /// \brief Return the name of the current input source name.
    ///
    /// If it's a file, it will be the C string given at the corresponding
    /// \c pushSource() call, that is, its filename.  If it's a stream, it will
    /// be formatted as \c "stream-%p" where \c %p is hex representation
    /// of the address of the stream object.
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns an empty string.
    ///
    /// \throw std::bad_alloc Resource allocation failed for string
    /// construction (rare case)
    ///
    /// \return A string representation of the current source (see the
    /// description)
    std::string getSourceName() const;

    /// \brief Return the input source line number.
    ///
    /// If there is an opened source, the return value will be a non-0
    /// integer indicating the line number of the current source where
    /// the \c MasterLexer is currently working.  The expected usage of
    /// this value is to print a helpful error message when parsing fails
    /// by specifically identifying the position of the error.
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns 0.
    ///
    /// \throw None
    ///
    /// \return The current line number of the source (see the description)
    size_t getSourceLine() const;

    /// \brief Parse and return another token from the input.
    ///
    /// It reads a bit of the last opened source and produces another token
    /// found in it.
    ///
    /// This method provides the weak exception guarantee. It'll return the
    /// class to the state before the call on exception, except that it is
    /// not possible to call ungetToken() after the failed call. This is
    /// considered OK, since the caller was expecting the call to succeed
    /// in which case the previous token would not be ungettable as well.
    ///
    /// \param options The options can be used to modify the tokenization.
    ///     The method can be made reporting things which are usually ignored
    ///     by this parameter. Multiple options can be passed at once by
    ///     bitwise or (eg. option1 | option 2). See description of available
    ///     options.
    /// \return Next token found in the input.
    /// \throw isc::InvalidOperation in case the source is not available. This
    ///     may mean the pushSource() has not been called yet, or that the
    ///     current source has been read past the end.
    /// \throw ReadError in case there's problem reading from the underlying
    ///     source (eg. I/O error in the file on the disk).
    /// \throw std::bad_alloc in case allocation of some internal resources
    ///     or the token fail.
    Token getNextToken(Options options = NONE);

    /// \brief Return the last token back to the lexer.
    ///
    /// The method undoes the lasts call to getNextToken(). If you call the
    /// getNextToken() again with the same options, it'll return the same
    /// token. If the options are different, it may return a different token,
    /// but it acts as if the previous getNextToken() was never called.
    ///
    /// It is possible to return only one token back in time (you can't call
    /// ungetToken() twice in a row without calling getNextToken() in between
    /// successfully).
    ///
    /// It does not work after change of source (by pushSource or popSource).
    ///
    /// \throw isc::InvalidOperation If called second time in a row or if
    ///     getNextToken() was not called since the last change of the source.
    void ungetToken();

protected:
    /// \brief Call the State::start()
    ///
    /// This calls the State::start() method and returns the result. It is
    /// a virtual method so tests can override it to mock some different
    /// behaviour.
    virtual const master_lexer_internal::State* start(Options options);

private:
    struct MasterLexerImpl;
    MasterLexerImpl* impl_;
};

/// \brief Operator to combine \c MasterLexer options
///
/// This is a trivial shortcut so that compound options can be specified
/// in an intuitive way.
inline MasterLexer::Options
operator|(MasterLexer::Options o1, MasterLexer::Options o2) {
    return (static_cast<MasterLexer::Options>(
                static_cast<unsigned>(o1) | static_cast<unsigned>(o2)));
}

/// \brief Tokens for \c MasterLexer
///
/// This is a simple value-class encapsulating a type of a lexer token and
/// (if it has a value) its value.  Essentially, the class provides
/// constructors corresponding to different types of tokens, and corresponding
/// getter methods.  The type and value are fixed at the time of construction
/// and will never be modified throughout the lifetime of the object.
/// The getter methods are still provided to maximize the safety; an
/// application cannot refer to a value that is invalid for the type of token.
///
/// This class is intentionally implemented as copyable and assignable
/// (using the default version of copy constructor and assignment operator),
/// but it's mainly for internal implementation convenience.  Applications will
/// simply refer to Token object as a reference via the \c MasterLexer class.
class MasterLexer::Token {
public:
    /// \brief Enumeration for token types
    ///
    /// \note At the time of initial implementation, all numeric tokens
    /// that would be extracted from \c MasterLexer should be represented
    /// as an unsigned 32-bit integer.  If we see the need for larger integers
    /// or negative numbers, we can then extend the token types.
    enum Type {
        END_OF_LINE, ///< End of line detected
        END_OF_FILE, ///< End of file detected
        INITIAL_WS,  ///< White spaces at the beginning of a line after an
                     ///< end of line (if asked for detecting it)
        NOVALUE_TYPE_MAX = INITIAL_WS, ///< Max integer corresponding to
                                       /// no-value (type only) types.
                                       /// Mainly for internal use.
        STRING, ///< A single string
        QSTRING, ///< A single string quoted by double-quotes (").
        NUMBER,  ///< A decimal number (unsigned 32-bit)
        ERROR    ///< Error detected in getting a token
    };

    /// \brief Enumeration for lexer error codes
    enum ErrorCode {
        NOT_STARTED, ///< The lexer is just initialized and has no token
        UNBALANCED_PAREN,       ///< Unbalanced parentheses detected
        UNEXPECTED_END, ///< The lexer reaches the end of line or file
                        /// unexpectedly
        UNBALANCED_QUOTES,      ///< Unbalanced quotations detected
        NO_TOKEN_PRODUCED, ///< No token was produced. This means programmer
                           /// error and should never get out of the lexer.
        MAX_ERROR_CODE ///< Max integer corresponding to valid error codes.
                       /// (excluding this one). Mainly for internal use.
    };

    /// \brief A simple representation of a range of a string.
    ///
    /// This is a straightforward pair of the start pointer of a string
    /// and its length.  The \c STRING and \c QSTRING types of tokens
    /// will be primarily represented in this form.
    ///
    /// Any character can be stored in the valid range of the region.
    /// In particular, there can be a nul character (\0) in the middle of
    /// the region.  On the other hand, it is not ensured that the string
    /// is nul-terminated.  So the usual string manipulation API may not work
    /// as expected.
    struct StringRegion {
        const char* beg;        ///< The start address of the string
        size_t len;             ///< The length of the string in bytes
    };

    /// \brief Constructor for non-value type of token.
    ///
    /// \throw InvalidParameter A value type token is specified.
    /// \param type The type of the token.  It must indicate a non-value
    /// type (not larger than \c NOVALUE_TYPE_MAX).
    explicit Token(Type type) : type_(type) {
        if (type > NOVALUE_TYPE_MAX) {
            isc_throw(InvalidParameter, "Token per-type constructor "
                      "called with invalid type: " << type);
        }
    }

    /// \brief Constructor for string and quoted-string types of token.
    ///
    /// The optional \c quoted parameter specifies whether it's a quoted or
    /// non quoted string.
    ///
    /// The string is specified as a pair of a pointer to the start address
    /// and its length.  Any character can be contained in any position of
    /// the valid range (see \c StringRegion).
    ///
    /// When it's a quoted string, the quotation marks must be excluded
    /// from the specified range.
    ///
    /// \param str_beg The start address of the string
    /// \param str_len The size of the string in bytes
    /// \param quoted true if it's a quoted string; false otherwise.
    Token(const char* str_beg, size_t str_len, bool quoted = false) :
        type_(quoted ? QSTRING : STRING)
    {
        val_.str_region_.beg = str_beg;
        val_.str_region_.len = str_len;
    }

    /// \brief Constructor for number type of token.
    ///
    /// \brief number An unsigned 32-bit integer corresponding to the token
    /// value.
    explicit Token(uint32_t number) : type_(NUMBER) {
        val_.number_ = number;
    }

    /// \brief Constructor for error type of token.
    ///
    /// \throw InvalidParameter Invalid error code value is specified.
    /// \brief error_code A pre-defined constant of \c ErrorCode.
    explicit Token(ErrorCode error_code) : type_(ERROR) {
        if (!(error_code < MAX_ERROR_CODE)) {
            isc_throw(InvalidParameter, "Invalid master lexer error code: "
                      << error_code);
        }
        val_.error_code_ = error_code;
    }

    /// \brief Return the token type.
    ///
    /// \throw none
    Type getType() const { return (type_); }

    /// \brief Return the value of a string-variant token.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \return A reference to \c StringRegion corresponding to the string
    ///         token value.
    const StringRegion& getStringRegion() const {
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
                      "Token::getStringRegion() for non string-variant type");
        }
        return (val_.str_region_);
    }

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// Note that the underlying string may contain a nul (\0) character
    /// in the middle.  The returned string object will contain all characters
    /// of the valid range of the underlying string.  So some string
    /// operations such as c_str() may not work as expected.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A std::string object corresponding to the string token value.
    std::string getString() const {
        std::string ret;
        getString(ret);
        return (ret);
    }

    /// \brief Fill in a string with the value of a string-variant token.
    ///
    /// This is similar to the other version of \c getString(), but
    /// the caller is supposed to pass a placeholder string object.
    /// This will be more efficient if the caller uses the same
    /// \c MasterLexer repeatedly and needs to get string token in the
    /// form of a string object many times as this version could reuse
    /// the existing internal storage of the passed string.
    ///
    /// Any existing content of the passed string will be removed.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    ///
    /// \param ret A string object to be filled with the token string.
    void getString(std::string& ret) const {
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
                      "Token::getString() for non string-variant type");
        }
        ret.assign(val_.str_region_.beg,
                   val_.str_region_.beg + val_.str_region_.len);
    }

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// \throw InvalidOperation Called on a non number type of token.
    /// \return The integer corresponding to the number token value.
    uint32_t getNumber() const {
        if (type_ != NUMBER) {
            isc_throw(InvalidOperation,
                      "Token::getNumber() for non number type");
        }
        return (val_.number_);
    }

    /// \brief Return the error code of a error type token.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \return The error code of the token.
    ErrorCode getErrorCode() const {
        if (type_ != ERROR) {
            isc_throw(InvalidOperation,
                      "Token::getErrorCode() for non error type");
        }
        return (val_.error_code_);
    };

    /// \brief Return a textual description of the error of a error type token.
    ///
    /// The returned string would be useful to produce a log message when
    /// a zone file parser encounters an error.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A string object that describes the meaning of the error.
    std::string getErrorText() const;

private:
    Type type_;    // this is not const so the class can be assignable

    // We use a union to represent different types of token values via the
    // unified Token class.  The class integrity should ensure valid operation
    // on the union; getter methods should only refer to the member set at
    // the construction.
    union {
        StringRegion str_region_;
        uint32_t number_;
        ErrorCode error_code_;
    } val_;
};

} // namespace dns
} // namespace isc
#endif  // MASTER_LEXER_H

// Local Variables:
// mode: c++
// End: