diff options
author | JINMEI Tatuya <jinmei@isc.org> | 2012-11-13 22:12:20 +0100 |
---|---|---|
committer | JINMEI Tatuya <jinmei@isc.org> | 2012-11-13 22:12:20 +0100 |
commit | 9a11ef62ae36d9d891e87ba792fa249ae82f0736 (patch) | |
tree | 5add7ee45614d0a9962e5a14e7654245bf6f31dd /src/lib/dns | |
parent | [master] Merge branch 'trac2463' (diff) | |
parent | [2372] clarified that INITIAL_WS is recognized only after an EOL. (diff) | |
download | kea-9a11ef62ae36d9d891e87ba792fa249ae82f0736.tar.xz kea-9a11ef62ae36d9d891e87ba792fa249ae82f0736.zip |
[master] Merge branch 'trac2372'
Diffstat (limited to 'src/lib/dns')
-rw-r--r-- | src/lib/dns/Makefile.am | 1 | ||||
-rw-r--r-- | src/lib/dns/master_lexer.cc | 170 | ||||
-rw-r--r-- | src/lib/dns/master_lexer.h | 29 | ||||
-rw-r--r-- | src/lib/dns/master_lexer_state.h | 138 | ||||
-rw-r--r-- | src/lib/dns/tests/Makefile.am | 1 | ||||
-rw-r--r-- | src/lib/dns/tests/master_lexer_state_unittest.cc | 256 |
6 files changed, 592 insertions, 3 deletions
diff --git a/src/lib/dns/Makefile.am b/src/lib/dns/Makefile.am index e81ef76a8d..14b74f7d27 100644 --- a/src/lib/dns/Makefile.am +++ b/src/lib/dns/Makefile.am @@ -97,6 +97,7 @@ libb10_dns___la_SOURCES += master_lexer_inputsource.h master_lexer_inputsource.c libb10_dns___la_SOURCES += labelsequence.h labelsequence.cc libb10_dns___la_SOURCES += masterload.h masterload.cc libb10_dns___la_SOURCES += master_lexer.h master_lexer.cc +libb10_dns___la_SOURCES += master_lexer_state.h libb10_dns___la_SOURCES += message.h message.cc libb10_dns___la_SOURCES += messagerenderer.h messagerenderer.cc libb10_dns___la_SOURCES += name.h name.cc diff --git a/src/lib/dns/master_lexer.cc b/src/lib/dns/master_lexer.cc index c9c5528995..992a051048 100644 --- a/src/lib/dns/master_lexer.cc +++ b/src/lib/dns/master_lexer.cc @@ -16,6 +16,7 @@ #include <dns/master_lexer.h> #include <dns/master_lexer_inputsource.h> +#include <dns/master_lexer_state.h> #include <boost/shared_ptr.hpp> @@ -32,10 +33,34 @@ typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr; using namespace master_lexer_internal; struct MasterLexer::MasterLexerImpl { - MasterLexerImpl() : token_(Token::NOT_STARTED) {} + MasterLexerImpl() : source_(NULL), token_(Token::NOT_STARTED), + paren_count_(0), last_was_eol_(false) + {} + + // A helper method to skip possible comments toward the end of EOL or EOF. + // commonly used by state classes. It returns the corresponding "end-of" + // character in case it's a comment; otherwise it simply returns the + // current character. + int skipComment(int c) { + if (c == ';') { + while (true) { + c = source_->getChar(); + if (c == '\n' || c == InputSource::END_OF_STREAM) { + return (c); + } + } + } + return (c); + } std::vector<InputSourcePtr> sources_; - Token token_; + InputSource* source_; // current source (NULL if sources_ is empty) + Token token_; // currently recognized token (set by a state) + + // These are used in states, and defined here only as a placeholder. 
+ // The main lexer class does not need these members. + size_t paren_count_; // nest count of the parentheses + bool last_was_eol_; // whether the lexer just passed an end-of-line }; MasterLexer::MasterLexer() : impl_(new MasterLexerImpl) { @@ -60,12 +85,14 @@ MasterLexer::pushSource(const char* filename, std::string* error) { return (false); } + impl_->source_ = impl_->sources_.back().get(); return (true); } void MasterLexer::pushSource(std::istream& input) { impl_->sources_.push_back(InputSourcePtr(new InputSource(input))); + impl_->source_ = impl_->sources_.back().get(); } void @@ -75,6 +102,8 @@ MasterLexer::popSource() { "MasterLexer::popSource on an empty source"); } impl_->sources_.pop_back(); + impl_->source_ = impl_->sources_.empty() ? NULL : + impl_->sources_.back().get(); } std::string @@ -115,5 +144,142 @@ MasterLexer::Token::getErrorText() const { return (error_text[val_.error_code_]); } +namespace master_lexer_internal { +// Below we implement state classes for state transitions of MasterLexer. +// Note that these need to be defined here so that they can refer to +// the details of MasterLexerImpl. + +typedef MasterLexer::Token Token; // convenience shortcut + +bool +State::wasLastEOL(const MasterLexer& lexer) const { + return (lexer.impl_->last_was_eol_); +} + +const MasterLexer::Token& +State::getToken(const MasterLexer& lexer) const { + return (lexer.impl_->token_); +} + +size_t +State::getParenCount(const MasterLexer& lexer) const { + return (lexer.impl_->paren_count_); +} + +namespace { +class CRLF : public State { +public: + CRLF() {} + virtual const State* handle(MasterLexer& lexer) const { + // We've just seen '\r'. If this is part of a sequence of '\r\n', + // we combine them as a single END-OF-LINE. Otherwise we treat the + // single '\r' as an EOL and continue tokenization from the character + // immediately after '\r'. One tricky case is that there's a comment + // between '\r' and '\n'. 
This implementation combines these + // characters and treats them as a single EOL (the behavior derived + // from BIND 9). Technically this may not be correct, but in practice + // the caller wouldn't distinguish this case from the case it has + // two EOLs, so we simplify the process. + const int c = getLexerImpl(lexer)->skipComment( + getLexerImpl(lexer)->source_->getChar()); + if (c != '\n') { + getLexerImpl(lexer)->source_->ungetChar(); + } + getLexerImpl(lexer)->token_ = Token(Token::END_OF_LINE); + getLexerImpl(lexer)->last_was_eol_ = true; + return (NULL); + } +}; + +// Currently this is provided mostly as a place holder +class String : public State { +public: + String() {} + virtual const State* handle(MasterLexer& /*lexer*/) const { + return (NULL); + } +}; + +// We use a common instance of each state in a singleton-like way to save +// construction overhead. They are not singletons in the strict sense as +// we don't prohibit direct construction of these objects. But that doesn't +// matter much anyway, because the definitions are completely hidden within +// this file. +const CRLF CRLF_STATE; +const String STRING_STATE; +} + +const State& +State::getInstance(ID state_id) { + switch (state_id) { + case CRLF: + return (CRLF_STATE); + case String: + return (STRING_STATE); + } + + // This is a bug of the caller, and this method is only expected to be + // used by tests, so we just forcefully make it fail by asserting the + // condition. + assert(false); + return (STRING_STATE); // a dummy return, to silence some compilers. 
+} + +const State* +State::start(MasterLexer& lexer, MasterLexer::Options options) { + // define some shortcuts + MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_; + size_t& paren_count = lexerimpl.paren_count_; + + while (true) { + const int c = lexerimpl.skipComment(lexerimpl.source_->getChar()); + if (c == InputSource::END_OF_STREAM) { + lexerimpl.last_was_eol_ = false; + if (paren_count != 0) { + lexerimpl.token_ = Token(Token::UNBALANCED_PAREN); + paren_count = 0; // reset to 0; this helps in lenient mode. + return (NULL); + } + lexerimpl.token_ = Token(Token::END_OF_FILE); + return (NULL); + } else if (c == ' ' || c == '\t') { + // If requested and we are not in (), recognize the initial space. + if (lexerimpl.last_was_eol_ && paren_count == 0 && + (options & MasterLexer::INITIAL_WS) != 0) { + lexerimpl.last_was_eol_ = false; + lexerimpl.token_ = Token(Token::INITIAL_WS); + return (NULL); + } + } else if (c == '\n') { + lexerimpl.last_was_eol_ = true; + if (paren_count == 0) { // we don't recognize EOL if we are in () + lexerimpl.token_ = Token(Token::END_OF_LINE); + return (NULL); + } + } else if (c == '\r') { + if (paren_count == 0) { // check if we are in () (see above) + return (&CRLF_STATE); + } + } else if (c == '(') { + lexerimpl.last_was_eol_ = false; + ++paren_count; + } else if (c == ')') { + lexerimpl.last_was_eol_ = false; + if (paren_count == 0) { + lexerimpl.token_ = Token(Token::UNBALANCED_PAREN); + return (NULL); + } + --paren_count; + } else { + // Note: in #2373 we should probably ungetChar(). + lexerimpl.last_was_eol_ = false; + return (&STRING_STATE); + } + // no code should be here; we just continue the loop. 
+ } +} + +} // namespace master_lexer_internal + } // end of namespace dns } // end of namespace isc diff --git a/src/lib/dns/master_lexer.h b/src/lib/dns/master_lexer.h index da6bb5dee2..854d602e03 100644 --- a/src/lib/dns/master_lexer.h +++ b/src/lib/dns/master_lexer.h @@ -24,6 +24,9 @@ namespace isc { namespace dns { +namespace master_lexer_internal { +class State; +} /// \brief Tokenizer for parsing DNS master files. /// @@ -64,9 +67,22 @@ namespace dns { /// this class does not throw for an error that would be reported as an /// exception in other classes. class MasterLexer { + friend class master_lexer_internal::State; public: class Token; // we define it separately for better readability + /// \brief Options for getNextToken. + /// + /// A compound option, indicating multiple options are set, can be + /// specified using the logical OR operator (operator|()). + enum Options { + NONE = 0, ///< No option + INITIAL_WS = 1, ///< recognize begin-of-line spaces after an + ///< end-of-line + QSTRING = 2, ///< recognize quoted string + NUMBER = 4 ///< recognize numeric text as integer + }; + /// \brief The constructor. /// /// \throw std::bad_alloc Internal resource allocation fails (rare case). @@ -167,6 +183,16 @@ private: MasterLexerImpl* impl_; }; +/// \brief Operator to combine \c MasterLexer options +/// +/// This is a trivial shortcut so that compound options can be specified +/// in an intuitive way. 
+inline MasterLexer::Options +operator|(MasterLexer::Options o1, MasterLexer::Options o2) { + return (static_cast<MasterLexer::Options>( + static_cast<unsigned>(o1) | static_cast<unsigned>(o2))); +} + /// \brief Tokens for \c MasterLexer /// /// This is a simple value-class encapsulating a type of a lexer token and @@ -192,7 +218,8 @@ public: enum Type { END_OF_LINE, ///< End of line detected (if asked for detecting it) END_OF_FILE, ///< End of file detected (if asked for detecting it) - INITIAL_WS, ///< White spaces at the beginning of a line + INITIAL_WS, ///< White spaces at the beginning of a line after an + ///< end of line NOVALUE_TYPE_MAX = INITIAL_WS, ///< Max integer corresponding to /// no-value (type only) types. /// Mainly for internal use. diff --git a/src/lib/dns/master_lexer_state.h b/src/lib/dns/master_lexer_state.h new file mode 100644 index 0000000000..86957c5e3f --- /dev/null +++ b/src/lib/dns/master_lexer_state.h @@ -0,0 +1,138 @@ +// Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC") +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH +// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, +// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +// PERFORMANCE OF THIS SOFTWARE. + +#ifndef MASTER_LEXER_STATE_H +#define MASTER_LEXER_STATE_H 1 + +#include <dns/master_lexer.h> + +namespace isc { +namespace dns { + +namespace master_lexer_internal { + +/// \brief Tokenization state for \c MasterLexer. 
+/// +/// This is a base class of classes that represent various states of a single +/// tokenization session of \c MasterLexer, i.e., the states used for a +/// single call to \c MasterLexer::getNextToken(). +/// +/// It follows the convention of the state design pattern: each derived class +/// corresponds to a specific state, and the state transition takes place +/// through the virtual method named \c handle(). The \c handle() method +/// takes the main \c MasterLexer object that holds all necessary internal +/// context, and updates it as necessary; each \c State derived class is +/// completely stateless. +/// +/// The initial transition takes place in a static method of the base class, +/// \c start(). This is mainly for implementation convenience; we need to +/// pass options given to \c MasterLexer::getNextToken() for the initial +/// state, so it makes more sense to separate the interface for the transition +/// from the initial state. +/// +/// When an object of a specific state class completes the session, it +/// normally sets the identified token in the lexer, and returns NULL; +/// if more transition is necessary, it returns a pointer to the next state +/// object. +/// +/// As is usual in the state design pattern, the \c State class is made +/// a friend class of \c MasterLexer and can refer to its internal details. +/// This is intentional; essentially it's a part of \c MasterLexer and +/// is defined as a separate class only for implementation clarity and better +/// testability. It's defined in a publicly visible header, but that's only +/// for testing purposes. No normal application or even no other classes of +/// this library are expected to use this class. +class State { +public: + /// \brief Begin state transitions to get the next token. + /// + /// This is the first method that \c MasterLexer needs to call for a + /// tokenization session. The lexer passes a reference to itself + /// and options given in \c getNextToken(). 
+ /// + /// \throw InputSource::ReadError Unexpected I/O error + /// \throw std::bad_alloc Internal resource allocation failure + /// + /// \param lexer The lexer object that holds the main context. + /// \param options The options passed to getNextToken(). + /// \return A pointer to the next state object or NULL if the transition + /// is completed. + static const State* start(MasterLexer& lexer, + MasterLexer::Options options); + + /// \brief Handle the process of one specific state. + /// + /// This method is expected to be called on the object returned by + /// start(), and keep being called on the returned object until NULL is + /// returned. The call chain will form the complete state transition. + /// + /// \throw InputSource::ReadError Unexpected I/O error + /// \throw std::bad_alloc Internal resource allocation failure + /// + /// \param lexer The lexer object that holds the main context. + /// \return A pointer to the next state object or NULL if the transition + /// is completed. + virtual const State* handle(MasterLexer& lexer) const = 0; + + /// \brief Types of states. + /// + /// Specific states are basically hidden within the implementation, + /// but we'd like to allow tests to examine them, so we provide + /// a way to get an instance of a specific state. + enum ID { + CRLF, ///< Just seen a carriage-return character + String ///< Handling a string token + }; + + /// \brief Returns a \c State instance of the given state. + /// + /// This is provided only for testing purposes so tests can check + /// the behavior of each state separately. \c MasterLexer shouldn't + /// need this method. + static const State& getInstance(ID state_id); + + /// \name Read-only accessors for testing purposes. + /// + /// These allow tests to inspect some selected portion of the internal + /// states of \c MasterLexer. These shouldn't be used except for testing + /// purposes. 
+ ///@{ + bool wasLastEOL(const MasterLexer& lexer) const; + const MasterLexer::Token& getToken(const MasterLexer& lexer) const; + size_t getParenCount(const MasterLexer& lexer) const; + ///@} + +protected: + /// \brief An accessor to the internal implementation class of + /// \c MasterLexer. + /// + /// This is provided for specific derived classes as they are not direct + /// friends of \c MasterLexer. + /// + /// \param lexer The lexer object that holds the main context. + /// \return A pointer to the implementation class object of the given + /// lexer. This is never NULL. + MasterLexer::MasterLexerImpl* getLexerImpl(MasterLexer& lexer) const { + return (lexer.impl_); + } +}; + +} // namespace master_lexer_internal +} // namespace dns +} // namespace isc +#endif // MASTER_LEXER_STATE_H + +// Local Variables: +// mode: c++ +// End: diff --git a/src/lib/dns/tests/Makefile.am b/src/lib/dns/tests/Makefile.am index d5adc21e8f..33867da0c2 100644 --- a/src/lib/dns/tests/Makefile.am +++ b/src/lib/dns/tests/Makefile.am @@ -27,6 +27,7 @@ run_unittests_SOURCES += labelsequence_unittest.cc run_unittests_SOURCES += messagerenderer_unittest.cc run_unittests_SOURCES += master_lexer_token_unittest.cc run_unittests_SOURCES += master_lexer_unittest.cc +run_unittests_SOURCES += master_lexer_state_unittest.cc run_unittests_SOURCES += name_unittest.cc run_unittests_SOURCES += nsec3hash_unittest.cc run_unittests_SOURCES += rrclass_unittest.cc rrtype_unittest.cc diff --git a/src/lib/dns/tests/master_lexer_state_unittest.cc b/src/lib/dns/tests/master_lexer_state_unittest.cc new file mode 100644 index 0000000000..bcee7fd1ec --- /dev/null +++ b/src/lib/dns/tests/master_lexer_state_unittest.cc @@ -0,0 +1,256 @@ +// Copyright (C) 2012 Internet Systems Consortium, Inc. 
("ISC") +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH +// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, +// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +// PERFORMANCE OF THIS SOFTWARE. + +#include <dns/master_lexer.h> +#include <dns/master_lexer_inputsource.h> +#include <dns/master_lexer_state.h> + +#include <gtest/gtest.h> + +#include <sstream> + +using namespace isc::dns; +using namespace master_lexer_internal; + +namespace { +typedef MasterLexer::Token Token; // shortcut + +class MasterLexerStateTest : public ::testing::Test { +protected: + MasterLexerStateTest() : common_options(MasterLexer::INITIAL_WS), + s_null(NULL), + s_crlf(State::getInstance(State::CRLF)), + s_string(State::getInstance(State::String)), + options(MasterLexer::NONE), + orig_options(options) + {} + + // Specify INITIAL_WS as common initial options. + const MasterLexer::Options common_options; + MasterLexer lexer; + const State* const s_null; + const State& s_crlf; + const State& s_string; + std::stringstream ss; + MasterLexer::Options options, orig_options; +}; + +// Common check for the end-of-file condition. +// Token is set to END_OF_FILE, and the lexer was NOT last eol state. +// Passed state can be any valid one; they are stateless, just providing the +// interface for inspection. 
+void +eofCheck(const State& state, MasterLexer& lexer) { + EXPECT_EQ(Token::END_OF_FILE, state.getToken(lexer).getType()); + EXPECT_FALSE(state.wasLastEOL(lexer)); +} + +TEST_F(MasterLexerStateTest, startAndEnd) { + // A simple case: the input is empty, so we begin with start and + // are immediately done. + lexer.pushSource(ss); + EXPECT_EQ(s_null, State::start(lexer, common_options)); + eofCheck(s_crlf, lexer); +} + +TEST_F(MasterLexerStateTest, startToEOL) { + ss << "\n"; + lexer.pushSource(ss); + + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_TRUE(s_crlf.wasLastEOL(lexer)); + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + + // The next lexer session will reach EOF. Same eof check should pass. + EXPECT_EQ(s_null, State::start(lexer, common_options)); + eofCheck(s_crlf, lexer); +} + +TEST_F(MasterLexerStateTest, space) { + // repeat '\t\n' twice (see below), then space after EOL + ss << " \t\n\t\n "; + lexer.pushSource(ss); + + // by default space characters and tabs will be ignored. We check this + // twice; at the second iteration, it's a white space at the beginning + // of line, but since we don't specify INITIAL_WS option, it's treated as + // normal space and ignored. + for (size_t i = 0; i < 2; ++i) { + EXPECT_EQ(s_null, State::start(lexer, MasterLexer::NONE)); + EXPECT_TRUE(s_crlf.wasLastEOL(lexer)); + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + } + + // Now we specify the INITIAL_WS option. It will be recognized and the + // corresponding token will be returned. + EXPECT_EQ(s_null, State::start(lexer, MasterLexer::INITIAL_WS)); + EXPECT_FALSE(s_crlf.wasLastEOL(lexer)); + EXPECT_EQ(Token::INITIAL_WS, s_crlf.getToken(lexer).getType()); +} + +TEST_F(MasterLexerStateTest, parentheses) { + ss << "\n(\na\n )\n "; // 1st \n is to check if 'was EOL' is set to false + lexer.pushSource(ss); + + EXPECT_EQ(s_null, State::start(lexer, common_options)); // handle \n + + // Now handle '('. 
It skips \n and recognize 'a' as string + EXPECT_EQ(0, s_crlf.getParenCount(lexer)); // check pre condition + EXPECT_EQ(&s_string, State::start(lexer, common_options)); + EXPECT_EQ(1, s_crlf.getParenCount(lexer)); // check post condition + EXPECT_FALSE(s_crlf.wasLastEOL(lexer)); + + // skip 'a' (note: until #2373 it's actually skipped as part of the '(' + // handling) + s_string.handle(lexer); + + // Then handle ')'. '\n' before ')' isn't recognized because + // it's canceled due to the '('. Likewise, the space after the '\n' + // shouldn't be recognized but should be just ignored. + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(0, s_crlf.getParenCount(lexer)); + + // Now, temporarily disabled options are restored: Both EOL and the + // initial WS are recognized + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(Token::INITIAL_WS, s_crlf.getToken(lexer).getType()); +} + +TEST_F(MasterLexerStateTest, nestedParentheses) { + // This is an unusual, but allowed (in this implementation) case. + ss << "(a(b)\n c)\n "; + lexer.pushSource(ss); + + EXPECT_EQ(&s_string, State::start(lexer, common_options)); // consume '(' + s_string.handle(lexer); // consume 'a' + EXPECT_EQ(&s_string, State::start(lexer, common_options)); // consume '(' + s_string.handle(lexer); // consume 'b' + EXPECT_EQ(2, s_crlf.getParenCount(lexer)); // now the count is 2 + + // Close the inner most parentheses. count will be decreased, but option + // shouldn't be restored yet, so the intermediate EOL or initial WS won't + // be recognized. + EXPECT_EQ(&s_string, State::start(lexer, common_options)); // consume ')' + s_string.handle(lexer); // consume 'c' + EXPECT_EQ(1, s_crlf.getParenCount(lexer)); + + // Close the outermost parentheses. count will be reset to 0, and original + // options are restored. 
+ EXPECT_EQ(s_null, State::start(lexer, common_options)); + + // Now, temporarily disabled options are restored: Both EOL and the + // initial WS are recognized + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(Token::INITIAL_WS, s_crlf.getToken(lexer).getType()); +} + +TEST_F(MasterLexerStateTest, unbalancedParentheses) { + // Only closing paren is provided. We prepend a \n to check if it's + // correctly canceled after detecting the error. + ss << "\n)"; + ss << "(a"; + lexer.pushSource(ss); + + EXPECT_EQ(s_null, State::start(lexer, common_options)); // consume '\n' + EXPECT_TRUE(s_crlf.wasLastEOL(lexer)); // this \n was remembered + + // Now checking ')'. The result should be error, count shouldn't be + // changed. "last EOL" should be canceled. + EXPECT_EQ(0, s_crlf.getParenCount(lexer)); + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(0, s_crlf.getParenCount(lexer)); + ASSERT_EQ(Token::ERROR, s_crlf.getToken(lexer).getType()); + EXPECT_EQ(Token::UNBALANCED_PAREN, s_crlf.getToken(lexer).getErrorCode()); + EXPECT_FALSE(s_crlf.wasLastEOL(lexer)); + + // Reach EOF with a dangling open parenthesis. + EXPECT_EQ(&s_string, State::start(lexer, common_options)); // consume '(' + s_string.handle(lexer); // consume 'a' + EXPECT_EQ(1, s_crlf.getParenCount(lexer)); + EXPECT_EQ(s_null, State::start(lexer, common_options)); // reach EOF + ASSERT_EQ(Token::ERROR, s_crlf.getToken(lexer).getType()); + EXPECT_EQ(Token::UNBALANCED_PAREN, s_crlf.getToken(lexer).getErrorCode()); + EXPECT_EQ(0, s_crlf.getParenCount(lexer)); // should be reset to 0 +} + +TEST_F(MasterLexerStateTest, startToComment) { + // Begin with 'start', skip space, then encounter a comment. Skip + // the rest of the line, and recognize the new line. Note that the + // second ';' is simply ignored. + ss << " ;a;\n"; + ss << ";a;"; // Likewise, but the comment ends with EOF. 
+ lexer.pushSource(ss); + + // Comment ending with EOL + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + + // Comment ending with EOF + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(Token::END_OF_FILE, s_crlf.getToken(lexer).getType()); +} + +TEST_F(MasterLexerStateTest, commentAfterParen) { + // comment after an opening parenthesis. The code that is tested by + // other tests should also ensure that it works correctly, but we + // check it explicitly. + ss << "( ;this is a comment\na)\n"; + lexer.pushSource(ss); + + // consume '(', skip comments, consume 'a', then consume ')' + EXPECT_EQ(&s_string, State::start(lexer, common_options)); + s_string.handle(lexer); + EXPECT_EQ(s_null, State::start(lexer, common_options)); + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); +} + +TEST_F(MasterLexerStateTest, crlf) { + ss << "\r\n"; // case 1 + ss << "\r "; // case 2 + ss << "\r;comment\na"; // case 3 + ss << "\r"; // case 4 + lexer.pushSource(ss); + + // 1. A sequence of \r, \n is recognized as a single 'end-of-line' + EXPECT_EQ(&s_crlf, State::start(lexer, common_options)); // recognize '\r' + EXPECT_EQ(s_null, s_crlf.handle(lexer)); // recognize '\n' + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + EXPECT_TRUE(s_crlf.wasLastEOL(lexer)); + + // 2. Single '\r' (not followed by \n) is recognized as a single + // 'end-of-line'. then there will be "initial WS" + EXPECT_EQ(&s_crlf, State::start(lexer, common_options)); // recognize '\r' + // see ' ', "unget" it + EXPECT_EQ(s_null, s_crlf.handle(lexer)); + EXPECT_EQ(s_null, State::start(lexer, common_options)); // recognize ' ' + EXPECT_EQ(Token::INITIAL_WS, s_crlf.getToken(lexer).getType()); + + // 3. 
comment between \r and \n + EXPECT_EQ(&s_crlf, State::start(lexer, common_options)); // recognize '\r' + // skip comments, recognize '\n' + EXPECT_EQ(s_null, s_crlf.handle(lexer)); + EXPECT_EQ(Token::END_OF_LINE, s_crlf.getToken(lexer).getType()); + EXPECT_EQ(&s_string, State::start(lexer, common_options)); + + // 4. \r then EOF + EXPECT_EQ(&s_crlf, State::start(lexer, common_options)); // recognize '\r' + // see EOF, then "unget" it + EXPECT_EQ(s_null, s_crlf.handle(lexer)); + EXPECT_EQ(s_null, State::start(lexer, common_options)); // recognize EOF + EXPECT_EQ(Token::END_OF_FILE, s_crlf.getToken(lexer).getType()); +} + +} |