src/lib/dns/master_lexer.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613

// Copyright (C) 2012-2024 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <config.h>

#include <exceptions/exceptions.h>

#include <dns/master_lexer.h>
#include <dns/master_lexer_inputsource.h>
#include <dns/master_lexer_state.h>

#include <boost/shared_ptr.hpp>
#include <boost/lexical_cast.hpp>

#include <bitset>
#include <cassert>
#include <limits>
#include <string>
#include <vector>

namespace isc {
namespace dns {

// Definition of MasterLexer::SOURCE_SIZE_UNKNOWN: the largest value a size_t
// can hold, used in this file as the marker for "size not known" (see
// MasterLexerImpl::setTotalSize()).
//
// Note that it is initialized using a function of another library.
// Technically, this could expose it to the static initialization order
// fiasco.  In this particular usage that is very unlikely to matter, because
// the value is expected to be used only as a return value of a MasterLexer
// method, and the MasterLexer constructor needs the definitions in this
// file.
const size_t MasterLexer::SOURCE_SIZE_UNKNOWN =
    std::numeric_limits<size_t>::max();

namespace {
// Shared-ownership handle used for the entries of the lexer's source stack.
using InputSourcePtr = boost::shared_ptr<master_lexer_internal::InputSource>;
} // end unnamed namespace
// Let the rest of this file name the internal helper types (InputSource,
// State) without qualification.
using namespace master_lexer_internal;


// Private implementation of MasterLexer: the stack of input sources, the
// token most recently produced, and the bookkeeping shared with the state
// classes defined later in this file.
struct MasterLexer::MasterLexerImpl {
    MasterLexerImpl() : source_(NULL), token_(MasterToken::NOT_STARTED),
                        total_size_(0), popped_size_(0),
                        paren_count_(0), last_was_eol_(true),
                        has_previous_(false),
                        previous_paren_count_(0),
                        previous_was_eol_(false)
    {
        // Characters that terminate an unquoted string/number token.
        separators_.set('\r');
        separators_.set('\n');
        separators_.set(' ');
        separators_.set('\t');
        separators_.set('(');
        separators_.set(')');
        separators_.set('"');
        // Right after a backslash only the line terminators still end a
        // token.
        esc_separators_.set('\r');
        esc_separators_.set('\n');
    }

    // A helper method to skip possible comments toward the end of EOL or EOF.
    // commonly used by state classes.  It returns the corresponding "end-of"
    // character in case it's a comment; otherwise it simply returns the
    // current character.  A ';' is not taken as the start of a comment when
    // 'escaped' is true.
    int skipComment(int c, bool escaped = false) {
        if (c == ';' && !escaped) {
            while (true) {
                c = source_->getChar();
                if (c == '\n' || c == InputSource::END_OF_STREAM) {
                    return (c);
                }
            }
        }
        return (c);
    }

    // Tell whether character 'c' terminates a string/number token.  When
    // 'escaped' is true the reduced esc_separators_ set is consulted
    // instead of separators_.  End of stream always terminates a token.
    bool isTokenEnd(int c, bool escaped) {
        // Special case of EOF (end of stream); this is not in the bitmaps
        if (c == InputSource::END_OF_STREAM) {
            return (true);
        }
        // The bitmaps only describe the range 0..0x7f.  A value outside of
        // that range is never a separator.  It must not be folded into the
        // range by masking: e.g. (0xa0 & 0x7f) == ' ', so a non-ASCII byte
        // would be mistaken for a separator.
        if (c < 0 || c > 0x7f) {
            return (false);
        }
        return (escaped ? esc_separators_.test(c) : separators_.test(c));
    }

    // Add the size of the current source to total_size_.  Once any source
    // reports SOURCE_SIZE_UNKNOWN the total stays unknown.  There must be a
    // current source when this is called.
    void setTotalSize() {
        assert(source_ != NULL);
        if (total_size_ != SOURCE_SIZE_UNKNOWN) {
            const size_t current_size = source_->getSize();
            if (current_size != SOURCE_SIZE_UNKNOWN) {
                total_size_ += current_size;
            } else {
                total_size_ = SOURCE_SIZE_UNKNOWN;
            }
        }
    }

    std::vector<InputSourcePtr> sources_;
    InputSource* source_;       // current source (NULL if sources_ is empty)
    MasterToken token_;         // currently recognized token (set by a state)
    std::vector<char> data_;    // placeholder for string data

    // Keep track of the total size of all sources and characters that have
    // been read from sources already popped.
    size_t total_size_;         // accumulated size (# of chars) of sources
    size_t popped_size_;        // total size of sources that have been popped

    // These are used in states, and defined here only as a placeholder.
    // The main lexer class does not need these members.
    size_t paren_count_;        // nest count of the parentheses
    bool last_was_eol_; // whether the lexer just passed an end-of-line

    // Bitmaps that give whether a given character in the range 0..127
    // should be considered a separator of a string/number token.  The esc_
    // version is a subset of the other, excluding characters that can be
    // ignored if escaped by a backslash.  See isTokenEnd() for how values
    // outside the bitmap range are treated.
    std::bitset<128> separators_;
    std::bitset<128> esc_separators_;

    // These are to allow restoring state before previous token.
    bool has_previous_;
    size_t previous_paren_count_;
    bool previous_was_eol_;
};

// Create a lexer with no input source.  All of its state is kept in a
// separately allocated MasterLexerImpl object.
MasterLexer::MasterLexer() :
    impl_(new MasterLexerImpl())
{}

// Release the implementation object allocated by the constructor.
MasterLexer::~MasterLexer() {
    delete impl_;
}

bool
MasterLexer::pushSource(const char* filename, std::string* error) {
    if (filename == NULL) {
        isc_throw(InvalidParameter,
                  "NULL filename for MasterLexer::pushSource");
    }
    try {
        impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
    } catch (const InputSource::OpenError& ex) {
        if (error != NULL) {
            *error = ex.what();
        }
        return (false);
    }

    impl_->source_ = impl_->sources_.back().get();
    impl_->has_previous_ = false;
    impl_->last_was_eol_ = true;
    impl_->setTotalSize();
    return (true);
}

void
MasterLexer::pushSource(std::istream& input) {
    try {
        impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
    } catch (const InputSource::OpenError& ex) {
        // Convert the "internal" exception to public one.
        isc_throw(Unexpected, "Failed to push a stream to lexer: " <<
                  ex.what());
    }
    impl_->source_ = impl_->sources_.back().get();
    impl_->has_previous_ = false;
    impl_->last_was_eol_ = true;
    impl_->setTotalSize();
}

void
MasterLexer::popSource() {
    if (impl_->sources_.empty()) {
        isc_throw(InvalidOperation,
                  "MasterLexer::popSource on an empty source");
    }
    impl_->popped_size_ += impl_->source_->getPosition();
    impl_->sources_.pop_back();
    impl_->source_ = impl_->sources_.empty() ? NULL :
        impl_->sources_.back().get();
    impl_->has_previous_ = false;
}

size_t
MasterLexer::getSourceCount() const {
    return (impl_->sources_.size());
}

std::string
MasterLexer::getSourceName() const {
    if (impl_->sources_.empty()) {
        return (std::string());
    }
    return (impl_->sources_.back()->getName());
}

size_t
MasterLexer::getSourceLine() const {
    if (impl_->sources_.empty()) {
        return (0);
    }
    return (impl_->sources_.back()->getCurrentLine());
}

// Return the accumulated size of all sources pushed so far.  The value is
// SOURCE_SIZE_UNKNOWN once the size of any pushed source was unknown.
size_t
MasterLexer::getTotalSourceSize() const {
    const size_t total = impl_->total_size_;
    return (total);
}

size_t
MasterLexer::getPosition() const {
    size_t position = impl_->popped_size_;
    for (auto const& src : impl_->sources_) {
        position += src->getPosition();
    }
    return (position);
}

// Read the next token from the current source, honoring 'options', and
// return a reference to it.  The reference is to the lexer's internal token
// object.  Throws InvalidOperation when no source has been pushed.
const MasterToken&
MasterLexer::getNextToken(Options options) {
    MasterLexerImpl& state = *impl_;
    if (!state.source_) {
        isc_throw(isc::InvalidOperation, "No source to read tokens from");
    }

    // Remember where we are so that ungetToken() can bring us back here.
    state.previous_paren_count_ = state.paren_count_;
    state.previous_was_eol_ = state.last_was_eol_;
    state.source_->mark();
    state.has_previous_ = true;

    // Start from a sentinel token; this is a debugging aid to verify below
    // that a token was actually produced.
    state.token_ = MasterToken(MasterToken::NO_TOKEN_PRODUCED);

    // State::start() either produces the token itself (EOF included) and
    // returns NULL, or hands over to the state that completes the token.
    const State* const next = State::start(*this, options);
    if (next) {
        next->handle(*this);
    }

    // Make sure a token was produced. Since this Can Not Happen, we assert
    // here instead of throwing.
    assert(!(state.token_.getType() == MasterToken::ERROR &&
             state.token_.getErrorCode() == MasterToken::NO_TOKEN_PRODUCED));
    return (state.token_);
}

namespace {
// Translate the token type a caller expects into the lexer option needed to
// recognize it.  Only STRING, QSTRING and NUMBER are supported; any other
// type results in InvalidParameter.
inline MasterLexer::Options
optionsForTokenType(MasterToken::Type expect) {
    if (expect == MasterToken::STRING) {
        return (MasterLexer::NONE);
    }
    if (expect == MasterToken::QSTRING) {
        return (MasterLexer::QSTRING);
    }
    if (expect == MasterToken::NUMBER) {
        return (MasterLexer::NUMBER);
    }
    isc_throw(InvalidParameter,
              "expected type for getNextToken not supported: " << expect);
}
}

// Read the next token and require it to be of type 'expect'.
//
// An end-of-line or end-of-file token is accepted instead when 'eol_ok' is
// true, and an unquoted string is accepted where a quoted string is
// expected.  Anything else is reported by throwing LexerError.  When the
// token has the wrong type, or is a NUMBER_OUT_OF_RANGE error, it is put
// back first.
const MasterToken&
MasterLexer::getNextToken(MasterToken::Type expect, bool eol_ok) {
    // Get the next token, specifying an appropriate option corresponding to
    // the expected type.  The result should be set in impl_->token_.
    getNextToken(optionsForTokenType(expect));

    const MasterToken::Type actual = impl_->token_.getType();

    if (actual == MasterToken::ERROR) {
        if (impl_->token_.getErrorCode() == MasterToken::NUMBER_OUT_OF_RANGE) {
            ungetToken();
        }
        throw LexerError(__FILE__, __LINE__, impl_->token_);
    }

    const bool at_end = (actual == MasterToken::END_OF_LINE ||
                         actual == MasterToken::END_OF_FILE);

    // The acceptable outcomes.
    if (at_end && eol_ok) {
        return (impl_->token_);
    }
    if (actual == MasterToken::STRING && expect == MasterToken::QSTRING) {
        return (impl_->token_);
    }
    if (actual == expect) {
        return (impl_->token_);
    }

    // Wrong kind of token: put it back and report why it was rejected.
    ungetToken();
    if (at_end) {
        throw LexerError(__FILE__, __LINE__,
                         MasterToken(MasterToken::UNEXPECTED_END));
    }
    assert(expect == MasterToken::NUMBER);
    throw LexerError(__FILE__, __LINE__,
                     MasterToken(MasterToken::BAD_NUMBER));
}

void
MasterLexer::ungetToken() {
    if (impl_->has_previous_) {
        impl_->has_previous_ = false;
        impl_->source_->ungetAll();
        impl_->last_was_eol_ = impl_->previous_was_eol_;
        impl_->paren_count_ = impl_->previous_paren_count_;
    } else {
        isc_throw(isc::InvalidOperation, "No token to unget ready");
    }
}

namespace {
// Descriptions of the lexer error codes.  MasterToken::getErrorText() uses
// the error code as the index into this table; the comment above each entry
// names the corresponding code.
const char* const error_text[] = {
    // NOT_STARTED
    "lexer not started",
    // UNBALANCED_PAREN
    "unbalanced parentheses",
    // UNEXPECTED_END
    "unexpected end of input",
    // UNBALANCED_QUOTES
    "unbalanced quotes",
    // NO_TOKEN_PRODUCED
    "no token produced",
    // NUMBER_OUT_OF_RANGE
    "number out of range",
    // BAD_NUMBER
    "not a valid number",
    // UNEXPECTED_QUOTES
    "unexpected quotes"
};
// Number of entries in error_text.
const size_t error_text_max_count = sizeof(error_text) / sizeof(*error_text);
} // end unnamed namespace

// Return the textual description of this token's error code.  Throws
// InvalidOperation when the token is not of the ERROR type.
std::string
MasterToken::getErrorText() const {
    if (type_ != ERROR) {
        isc_throw(InvalidOperation,
                  "MasterToken::getErrorText() for non error type");
    }

    // The class integrity ensures the code is a valid index of the table.
    assert(val_.error_code_ < error_text_max_count);
    const char* const description = error_text[val_.error_code_];
    return (description);
}

namespace master_lexer_internal {
// Below we implement state classes for state transitions of MasterLexer.
// Note that these need to be defined here so that they can refer to
// the details of MasterLexerImpl.

bool
State::wasLastEOL(const MasterLexer& lexer) const {
    return (lexer.impl_->last_was_eol_);
}

// Return a reference to the token currently stored in the lexer.
const MasterToken&
State::getToken(const MasterLexer& lexer) const {
    const MasterLexer::MasterLexerImpl& impl = *lexer.impl_;
    return (impl.token_);
}

size_t
State::getParenCount(const MasterLexer& lexer) const {
    return (lexer.impl_->paren_count_);
}

namespace {
// State entered right after a '\r' has been read outside of parentheses.
class CRLF : public State {
public:
    CRLF() {}
    virtual ~CRLF() {}          // see the base class for the destructor
    virtual void handle(MasterLexer& lexer) const {
        // A '\r' has just been consumed.  A following '\n' is merged with
        // it into a single END-OF-LINE; otherwise the lone '\r' is itself
        // the EOL and tokenization continues from the character right
        // after it.  The tricky case is a comment between '\r' and '\n':
        // this implementation merges all of it into one EOL (the behavior
        // derived from BIND 9).  Technically this may not be correct, but
        // in practice the caller wouldn't distinguish it from two
        // consecutive EOLs, so we simplify the process.
        auto const impl = getLexerImpl(lexer);
        const int next = impl->skipComment(impl->source_->getChar());
        if (next != '\n') {
            impl->source_->ungetChar();
        }
        impl->token_ = MasterToken(MasterToken::END_OF_LINE);
        impl->last_was_eol_ = true;
    }
};

// State that reads an unquoted string token; handle() is defined below.
class String : public State {
public:
    String() {}
    virtual ~String() {}      // see the base class for the destructor
    virtual void handle(MasterLexer& lexer) const;
};

// State that reads a quoted string token; handle() is defined below.
class QString : public State {
public:
    QString() {}
    virtual ~QString() {}      // see the base class for the destructor
    virtual void handle(MasterLexer& lexer) const;
};

// State that reads a token starting with a digit; handle() is defined below.
class Number : public State {
public:
    Number() {}
    virtual ~Number() {}
    virtual void handle(MasterLexer& lexer) const;
};

// We use one common instance of each state in a singleton-like way to save
// construction overhead.  They are not singletons in the strict sense, as
// we don't prohibit direct construction of these objects.  But that doesn't
// matter much anyway, because the definitions are completely hidden within
// this file.
const CRLF CRLF_STATE;
const String STRING_STATE;
const QString QSTRING_STATE;
const Number NUMBER_STATE;
} // end unnamed namespace

// Return the shared state object that corresponds to 'state_id'.
const State&
State::getInstance(ID state_id) {
    if (state_id == CRLF) {
        return (CRLF_STATE);
    }
    if (state_id == String) {
        return (STRING_STATE);
    }
    if (state_id == QString) {
        return (QSTRING_STATE);
    }
    if (state_id == Number) {
        return (NUMBER_STATE);
    }

    // An unknown ID is a bug of the caller.  As this method is only
    // expected to be used by tests, we just forcefully make it fail by
    // asserting the condition.
    assert(false);
    return (STRING_STATE); // a dummy return, to silence some compilers.
}

// Begin recognizing the next token of 'lexer'.
//
// Characters are consumed (comments skipped, parentheses counted) until
// either a complete token can be stored in the lexer, in which case NULL is
// returned, or the first character of a longer token is seen, in which case
// the state object that must complete the token is returned.  'options' is
// a bit mask of MasterLexer::Options values selecting which optional token
// kinds (initial white space, quoted strings, numbers) are recognized.
const State*
State::start(MasterLexer& lexer, MasterLexer::Options options) {
    // define some shortcuts
    MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_;
    size_t& paren_count = lexerimpl.paren_count_;

    // Note: the if-else in the loop is getting complicated.  When we complete
    // #2374, revisit the organization to see if we need a fundamental
    // refactoring.
    while (true) {
        const int c = lexerimpl.skipComment(lexerimpl.source_->getChar());
        if (c == InputSource::END_OF_STREAM) {
            lexerimpl.last_was_eol_ = false;
            if (paren_count != 0) {
                lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
                paren_count = 0; // reset to 0; this helps in lenient mode.
                return (NULL);
            }
            lexerimpl.token_ = MasterToken(MasterToken::END_OF_FILE);
            return (NULL);
        } else if (c == ' ' || c == '\t') {
            // If requested and we are not in (), recognize the initial space.
            if (lexerimpl.last_was_eol_ && paren_count == 0 &&
                (options & MasterLexer::INITIAL_WS) != 0) {
                lexerimpl.last_was_eol_ = false;
                lexerimpl.token_ = MasterToken(MasterToken::INITIAL_WS);
                return (NULL);
            }
        } else if (c == '\n') {
            lexerimpl.last_was_eol_ = true;
            if (paren_count == 0) { // we don't recognize EOL if we are in ()
                lexerimpl.token_ = MasterToken(MasterToken::END_OF_LINE);
                return (NULL);
            }
        } else if (c == '\r') {
            if (paren_count == 0) { // check if we are in () (see above)
                return (&CRLF_STATE);
            }
        } else if (c == '"') {
            if ((options & MasterLexer::QSTRING) != 0) {
                lexerimpl.last_was_eol_ = false;
                return (&QSTRING_STATE);
            } else {
                lexerimpl.token_ = MasterToken(MasterToken::UNEXPECTED_QUOTES);
                return (NULL);
            }
        } else if (c == '(') {
            lexerimpl.last_was_eol_ = false;
            ++paren_count;
        } else if (c == ')') {
            lexerimpl.last_was_eol_ = false;
            if (paren_count == 0) {
                lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
                return (NULL);
            }
            --paren_count;
        } else if ((options & MasterLexer::NUMBER) != 0 &&
                   isdigit(static_cast<unsigned char>(c))) {
            // Note: isdigit() is only defined for values representable as
            // unsigned char (and EOF), hence the cast; end of stream has
            // already been handled above.
            lexerimpl.last_was_eol_ = false;
            // this character will be handled in the number state
            lexerimpl.source_->ungetChar();
            return (&NUMBER_STATE);
        } else {
            // this character will be handled in the string state
            lexerimpl.source_->ungetChar();
            lexerimpl.last_was_eol_ = false;
            return (&STRING_STATE);
        }
        // no code should be here; we just continue the loop.
    }
}

// Read an unquoted string token.
//
// Characters are collected in the lexer's data buffer until one that ends
// the token is seen; that character is put back for the next read.  A
// backslash makes the following character "escaped", which changes what
// counts as a comment start and as a token end.
void
String::handle(MasterLexer& lexer) const {
    auto const impl = getLexerImpl(lexer);
    std::vector<char>& buf = impl->data_;
    buf.clear();

    bool after_backslash = false;
    for (;;) {
        const int ch = impl->skipComment(impl->source_->getChar(),
                                         after_backslash);

        if (!impl->isTokenEnd(ch, after_backslash)) {
            // Ordinary character: remember whether it escapes the next one
            // and keep collecting.
            after_backslash = (ch == '\\' && !after_backslash);
            buf.push_back(ch);
            continue;
        }

        impl->source_->ungetChar();
        // make sure it nul-terminated as a c-str (excluded from token
        // data).
        buf.push_back('\0');
        impl->token_ = MasterToken(&buf.at(0), buf.size() - 1);
        return;
    }
}

// Read a quoted string token; the opening '"' has already been consumed.
//
// The token ends at the next '"' that is not escaped by a backslash.  End
// of input produces UNEXPECTED_END, and an unescaped newline before the
// closing quote produces UNBALANCED_QUOTES (the newline is put back).
void
QString::handle(MasterLexer& lexer) const {
    auto const impl = getLexerImpl(lexer);
    MasterToken& result = impl->token_;
    std::vector<char>& buf = impl->data_;
    buf.clear();

    bool after_backslash = false;
    for (;;) {
        const int ch = impl->source_->getChar();

        if (ch == InputSource::END_OF_STREAM) {
            result = MasterToken(MasterToken::UNEXPECTED_END);
            return;
        }

        if (ch == '"') {
            if (!after_backslash) {
                // Closing quote.  Make sure the data is nul-terminated as a
                // c-str (excluded from token data).  This also simplifies
                // the case of an empty string.
                buf.push_back('\0');
                result = MasterToken(&buf.at(0), buf.size() - 1, true);
                return;
            }
            // found escaped '"'. overwrite the preceding backslash.
            assert(!buf.empty());
            after_backslash = false;
            buf.back() = '"';
            continue;
        }

        if (ch == '\n' && !after_backslash) {
            impl->source_->ungetChar();
            result = MasterToken(MasterToken::UNBALANCED_QUOTES);
            return;
        }

        after_backslash = (ch == '\\' && !after_backslash);
        buf.push_back(ch);
    }
}

// Read a token that may be a number.
//
// All characters up to the end of the token are collected first.  If they
// are all decimal digits, the text is converted to a 32-bit unsigned value
// and a number token is produced, or a NUMBER_OUT_OF_RANGE error token when
// the conversion fails.  Otherwise the collected text is produced as a
// string token, in the same way as String::handle() does.
void
Number::handle(MasterLexer& lexer) const {
    MasterToken& token = getLexerImpl(lexer)->token_;

    // It may yet turn out to be a string, so we first
    // collect all the data
    bool digits_only = true;
    std::vector<char>& data = getLexerImpl(lexer)->data_;
    data.clear();
    bool escaped = false;

    while (true) {
        const int c = getLexerImpl(lexer)->skipComment(
            getLexerImpl(lexer)->source_->getChar(), escaped);
        if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
            getLexerImpl(lexer)->source_->ungetChar();
            // We need to close the string whether it's digits-only (for
            // lexical_cast) or not (see String::handle()).
            data.push_back('\0');
            if (digits_only) {
                try {
                    const uint32_t number32 =
                        boost::lexical_cast<uint32_t, const char*>(&data[0]);
                    token = MasterToken(number32);
                } catch (const boost::bad_lexical_cast&) {
                    // Since we already know we have only digits,
                    // range should be the only possible problem.
                    token = MasterToken(MasterToken::NUMBER_OUT_OF_RANGE);
                }
            } else {
                token = MasterToken(&data.at(0), data.size() - 1);
            }
            return;
        }
        // isdigit() is only defined for values representable as unsigned
        // char (and EOF), hence the cast; end of stream never gets here
        // because isTokenEnd() reports it as a token end.
        if (!isdigit(static_cast<unsigned char>(c))) {
            digits_only = false;
        }
        escaped = (c == '\\' && !escaped);
        data.push_back(c);
    }
}

} // namespace master_lexer_internal

} // end of namespace dns
} // end of namespace isc