AbstractScanner.hpp
Go to the documentation of this file.
1 // @formatter:off
2 //
3 // Balau core C++ library
4 //
5 // Copyright (C) 2008 Bora Software (contact@borasoftware.com)
6 //
7 // Licensed under the Boost Software License - Version 1.0 - August 17th, 2003.
8 // See the LICENSE file for the full license text.
9 //
10 
16 
17 #ifndef COM_BORA_SOFTWARE__BALAU_LANG__ABSTRACT_SCANNER
18 #define COM_BORA_SOFTWARE__BALAU_LANG__ABSTRACT_SCANNER
19 
23 #include <Balau/Util/Streams.hpp>
24 
25 #include <stack>
26 
27 namespace Balau::Lang {
28 
// Base class for scanners, parameterised on the token type TokenT.
//
// Derived classes supply getNextToken(); this class holds the source text and
// records every token returned, together with the offset at which it starts.
50 template <typename TokenT> class AbstractScanner {
// NOTE(review): the signature line of scan() (listing line 54) is absent from
// this extract; the statements below are its body. According to the page's
// tooltip text, scan() scans the input and returns a ScannedTokens<TokenT>.
//
// Both offsets are reset, then getNextToken() is called repeatedly until it
// returns TokenT::EndOfFile. The EndOfFile token is recorded as well.
55  currentStartOffset = 0U;
56  currentEndOffset = 0U;
57  TokenT token = getNextToken();
58 
59  while (token != TokenT::EndOfFile) {
60  tokens.emplace_back(token);
61  startOffsets.emplace_back(currentStartOffset);
// The next token starts where the previous one ended.
62  currentStartOffset = currentEndOffset;
63  token = getNextToken();
64  }
65 
66  // EOF
67  tokens.emplace_back(token);
68  startOffsets.emplace_back(currentStartOffset);
69 
// The text, tokens and start offsets are moved out of this object, so these
// members are left in a moved-from state once the result has been built.
70  return ScannedTokens<TokenT>(uri, std::move(text), std::move(tokens), std::move(startOffsets));
71  }
72 
74 
75  protected: explicit AbstractScanner(std::shared_ptr<Resource::Uri> uri_)
76  : currentChar(0)
77  , uri(std::move(uri_))
78  , text(::toString(uri->byteReadResource()->readStream()))
79  , currentEndOffset(0) {}
80 
81  protected: AbstractScanner(std::shared_ptr<Resource::Uri> uri_, std::istream & inputStream_)
82  : currentChar(0)
83  , uri(std::move(uri_))
84  , text(::toString(inputStream_))
85  , currentEndOffset(0) {}
86 
// Read the next code point of the text into currentChar.
//
// When currentEndOffset has reached the end of the text, currentChar is set to
// std::char_traits<char32_t>::eof() and the offset is left untouched. Otherwise
// the code point is obtained from Character::getNextUtf8, which receives
// currentEndOffset by reference (presumably advancing it past the code point).
//
// NOTE(review): listing lines 94-95, which open the error-raising call whose
// arguments appear below, are absent from this extract.
// NOTE(review): currentChar is declared as char32_t, which is an unsigned
// type, so the "currentChar < 0" test cannot be true as written. Confirm how
// getNextUtf8 signals invalid input.
87  protected: void readNextChar() {
88  if (currentEndOffset == (int) text.length()) {
89  currentChar = std::char_traits<char32_t>::eof();
90  } else {
91  currentChar = Character::getNextUtf8(text, currentEndOffset);
92 
93  if (currentChar < 0) {
96  , "Invalid UTF-8 text found during parsing of string literal", calculateCurrentCodeSpan()
97  );
98  }
99  }
100  }
101 
102  // Get a string view of the current token's string. If the token is partly parsed, this view is partial.
103  protected: std::string_view getCurrentString() {
104  return std::string_view(text.data() + currentStartOffset, currentEndOffset - currentStartOffset);
105  }
106 
107  protected: template <typename ContainerT, typename ReportT>
108  bool readNextChar(ContainerT & container, const std::function<ReportT (const CodeSpan &)> & errorReport) {
109  if (currentEndOffset == (int) text.length()) {
110  currentChar = std::char_traits<char32_t>::eof();
111  } else {
112  currentChar = Character::getNextUtf8(text, currentEndOffset);
113 
114  if (currentChar < 0) {
115  container.push_back(errorReport(calculateCurrentCodeSpan()));
116  return false;
117  }
118  }
119 
120  return true;
121  }
122 
123  protected: void putBackCurrentChar() {
124  if (currentChar == std::char_traits<char32_t>::eof() || text.length() == 0 || currentEndOffset == 0) {
125  return;
126  }
127 
128  if (currentEndOffset > 1) {
129  Character::retreatUtf8(text, currentEndOffset);
130  Character::retreatUtf8(text, currentEndOffset);
131  currentChar = Character::getNextUtf8(text, currentEndOffset);
132  } else if (currentEndOffset == 1) {
133  Character::retreatUtf8(text, currentEndOffset);
134  currentChar = 0;
135  } else { // currentEndOffset == 0
136  // NOP
137  }
138  }
139 
// Consume the remainder of a double-quoted string literal.
//
// Characters are read one at a time until an unescaped closing double quote is
// found. A backslash causes the following character to be read and skipped, so
// an escaped double quote does not end the literal.
//
// NOTE(review): listing lines 156-157, which open the error-raising call whose
// arguments appear in the end-of-file case below, are absent from this extract.
140  protected: void extractStringConstantDoubleQuotes() {
141  // The current character must already be the double quote character.
142  while (true) {
143  readNextChar();
144 
145  switch (currentChar) {
146  case U'"': {
147  return;
148  }
149 
150  case U'\\': { // Escaped character.
151  readNextChar();
152  continue;
153  }
154 
155  case std::char_traits<char32_t>::eof(): {
158  , "End of file found during parsing of string literal", calculateCurrentCodeSpan()
159  );
160  }
161 
162  default: {
163  continue;
164  }
165  }
166  }
167  }
168 
169  protected: template <typename ContainerT, typename ReportT>
170  bool extractStringConstantDoubleQuotes(ContainerT & container, const std::function<ReportT (const CodeSpan &)> & errorReport) {
171  // The current character must already be the double quote character.
172  while (true) {
173  readNextChar();
174 
175  switch (currentChar) {
176  case U'"': {
177  return true;
178  }
179 
180  case U'\\': { // Escaped character.
181  readNextChar();
182  continue;
183  }
184 
185  case std::char_traits<char32_t>::eof(): {
186  container.push_back(errorReport(calculateCurrentCodeSpan()));
187  return false;
188  }
189 
190  default: {
191  continue;
192  }
193  }
194  }
195  }
196 
197  protected: TokenT createWhitespaceToken() {
198  // The current character must already be a blank or line break character.
199 
200  if (Balau::Character::isBlank(currentChar)) {
201  return createBlankToken();
202  } else {
203  return createLineBreakToken();
204  }
205  }
206 
207  protected: TokenT createBlankToken() {
208  // The current character must already be a blank character.
209 
210  while (Balau::Character::isBlank(currentChar)) {
211  readNextChar();
212  }
213 
214  putBackCurrentChar();
215  return TokenT::Blank;
216  }
217 
218  protected: TokenT createLineBreakToken() {
219  // The current character must already be a line break character.
220 
221  if (currentChar == '\n') {
222  readNextChar();
223 
224  if (currentChar != '\r') {
225  putBackCurrentChar();
226  }
227  } else { // currentChar == '\r'
228  readNextChar();
229 
230  if (currentChar != '\n') {
231  putBackCurrentChar();
232  }
233  }
234 
235  return TokenT::LineBreak;
236  }
237 
// Compute the CodeSpan used when reporting an error at the current position.
//
// The text, the tokens recorded so far, their start offsets and the index of
// the last recorded token (tokens.size() - 1) are passed to the callee.
//
// NOTE(review): listing line 239, which names the callee, is absent from this
// extract. The page's tooltip text mentions ScannedTokens::determineCodeSpan,
// which is presumably the function called here - confirm.
// NOTE(review): the cast applies to tokens.size() before the subtraction, so
// when no token has been recorded yet the index wraps round to the largest
// unsigned int value. Confirm that the callee tolerates this.
238  protected: CodeSpan calculateCurrentCodeSpan() {
240  text, tokens, startOffsets, (unsigned int) tokens.size() - 1
241  );
242  }
243 
244  //
245  // This method is implemented by derived classes and contains the actual scanning logic.
246  //
247  // Derived classes should parse the next characters and return the token type.
248  // The start offsets are handled by the base class.
249  //
// The scanning loop in this class stops when this method returns
// TokenT::EndOfFile, so implementations must return that value at the end of
// the text.
250  private: virtual TokenT getNextToken() = 0;
251 
// The code point most recently read, or std::char_traits<char32_t>::eof() once
// the end of the text has been reached. Visible to derived classes.
252  protected: char32_t currentChar;
253 
// The URI supplied to the constructor; passed on to the ScannedTokens result.
254  private: std::shared_ptr<Resource::Uri> uri;
// The complete text being scanned, read in full by the constructor.
255  private: std::string text;
// NOTE(review): not referenced by any code visible in this listing.
256  private: std::istringstream inputStream;
// NOTE(review): not referenced by any code visible in this listing.
257  private: std::stack<char32_t> putBackBuffer;
// Offset into text at which the token currently being scanned starts.
258  private: int currentStartOffset;
// Offset into text just past the last code point read.
259  private: int currentEndOffset;
// Tokens recorded so far, in the order in which they were scanned.
260  private: std::vector<TokenT> tokens;
// Start offset of each recorded token; parallel to the tokens vector.
261  private: std::vector<unsigned int> startOffsets;
262 };
262 };
263 
264 } // namespace Balau::Lang
265 
266 #endif // COM_BORA_SOFTWARE__BALAU_LANG__ABSTRACT_SCANNER
Thrown when a parser incurs invalid syntax.
Definition: ParsingExceptions.hpp:28
Base class for scanners.
Definition: AbstractScanner.hpp:50
ScannedTokens< TokenT > scan()
Scan the input and return a ScannedTokens data structure.
Definition: AbstractScanner.hpp:54
#define ThrowBalauException(ExceptionClass,...)
Throw a Balau style exception, with implicit file and line number, and optional stacktrace.
Definition: BalauException.hpp:45
Encapsulation of a set of tokens, text, and offsets plus various API wrappers.
Information on the span of some source code text.
Definition: CodeSpan.hpp:91
Utilities for streams.
static char32_t getNextUtf8(const std::string_view &text, int &offset)
Get the next code point from the UTF-8 string view.
Definition: Character.hpp:179
static bool isBlank(char32_t c)
Is the specified code point a character that visibly separates words on a line.
Definition: Character.hpp:113
Encapsulation of a set of language tokens, source text, and start offsets.
Definition: ScannedTokens.hpp:47
Parsing tools and parser implementations.
Definition: AbstractScanner.hpp:27
Balau::U8String< AllocatorT > toString(const CodeSpan &codeSpan)
Print the supplied code span as a UTF-8 string.
Definition: CodeSpan.hpp:235
A position in a piece of multi-line text.
static CodeSpan determineCodeSpan(const ScannedTokens< TokenT > &scannedTokens, unsigned int index)
Utility to determine the overall code span for the specified index.
Definition: ScannedTokens.hpp:199
static void retreatUtf8(const std::string_view &text, int &offset)
Retreat the supplied offset from one code point boundary to the previous one.
Definition: Character.hpp:274
A read-only resource which is read as bytes.