klee
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Lexer.cpp
Go to the documentation of this file.
1 //===-- Lexer.cpp ---------------------------------------------------------===//
2 //
3 // The KLEE Symbolic Virtual Machine
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "expr/Lexer.h"
11 
12 #include "llvm/Support/MemoryBuffer.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 #include <iomanip>
16 #include <string.h>
17 
18 using namespace llvm;
19 using namespace klee;
20 using namespace klee::expr;
21 
23 
24 const char *Token::getKindName() const {
25  switch (kind) {
26  default:
27  case Unknown: return "Unknown";
28  case Arrow: return "Arrow";
29  case At: return "At";
30  case Colon: return "Colon";
31  case Comma: return "Comma";
32  case Comment: return "Comment";
33  case EndOfFile: return "EndOfFile";
34  case Equals: return "Equals";
35  case Identifier: return "Identifier";
36  case KWArray: return "KWArray";
37  case KWFalse: return "KWFalse";
38  case KWQuery: return "KWQuery";
39  case KWReserved: return "KWReserved";
40  case KWSymbolic: return "KWSymbolic";
41  case KWTrue: return "KWTrue";
42  case KWWidth: return "KWWidth";
43  case LBrace: return "LBrace";
44  case LParen: return "LParen";
45  case LSquare: return "LSquare";
46  case Number: return "Number";
47  case RBrace: return "RBrace";
48  case RParen: return "RParen";
49  case RSquare: return "RSquare";
50  case Semicolon: return "Semicolon";
51  }
52 }
53 
54 void Token::dump() {
55  llvm::errs() << "(Token \"" << getKindName() << "\" "
56  << (const void*) start << " " << length << " "
57  << line << " " << column << ")";
58 }
59 
61 
/// Return true if \p Char may appear inside an identifier after its first
/// character: an alphanumeric character, '_', '.' or '-'.
///
/// \param Char A character value, or a negative value such as the -1
///             end-of-input sentinel; anything outside 0..255 is rejected.
static inline bool isInternalIdentifierChar(int Char) {
  if (Char == '_' || Char == '.' || Char == '-')
    return true;

  // The <ctype.h> classifiers are only defined for values representable as
  // unsigned char (and EOF), so reject everything else before calling them.
  return Char >= 0 && Char <= 0xFF && isalnum(Char);
}
65 
66 Lexer::Lexer(const llvm::MemoryBuffer *MB)
67  : BufferPos(MB->getBufferStart()), BufferEnd(MB->getBufferEnd()),
68  LineNumber(1), ColumnNumber(0) {
69 }
70 
72 }
73 
75  if (BufferPos == BufferEnd)
76  return -1;
77  return *BufferPos;
78 }
79 
81  if (BufferPos == BufferEnd)
82  return -1;
83 
84  // Handle DOS/Mac newlines here, by stripping duplicates and by
85  // returning '\n' for both.
86  char Result = *BufferPos++;
87  if (Result == '\n' || Result == '\r') {
88  if (BufferPos != BufferEnd && *BufferPos == ('\n' + '\r' - Result))
89  ++BufferPos;
90  Result = '\n';
91  }
92 
93  if (Result == '\n') {
94  ++LineNumber;
95  ColumnNumber = 0;
96  } else {
97  ++ColumnNumber;
98  }
99 
100  return Result;
101 }
102 
104  Result.kind = k;
105  Result.length = BufferPos - Result.start;
106  return Result;
107 }
108 
/// Return true if the \p N characters starting at \p Str spell one of the
/// pattern-based reserved keywords:
///
///   - i[0-9]+            e.g. "i32"
///   - fp[0-9]+([.].*)?   e.g. "fp64", "fp80.x87"
///
/// Note that the "fp" form is only recognized when \p N is at least 4, so a
/// three-character spelling such as "fp8" is not reported as reserved.
///
/// \param Str Start of the candidate text; need not be NUL-terminated.
/// \param N   Number of characters to examine.
static bool isReservedKW(const char *Str, unsigned N) {
  unsigned i;

  // Check for i[0-9]+
  if (N > 1 && Str[0] == 'i') {
    i = 1;
    // isdigit() requires an unsigned char value; a plain char may be negative.
    while (i < N && isdigit(static_cast<unsigned char>(Str[i])))
      ++i;
    if (i == N)
      return true;
  }

  // Check for fp[0-9]+([.].*)?$
  if (N > 3 && Str[0] == 'f' && Str[1] == 'p' &&
      isdigit(static_cast<unsigned char>(Str[2]))) {
    i = 3;
    while (i < N && isdigit(static_cast<unsigned char>(Str[i])))
      ++i;
    // Either the digits run to the end, or a '.' introduces a free-form
    // suffix.
    if (i == N || Str[i] == '.')
      return true;
  }

  return false;
}
/// Return true if the \p N characters starting at \p Str spell a width
/// keyword, i.e. match w[0-9]+ (for example "w32").
///
/// \param Str Start of the candidate text; need not be NUL-terminated.
/// \param N   Number of characters to examine.
static bool isWidthKW(const char *Str, unsigned N) {
  // Need the leading 'w' plus at least one digit.
  if (N < 2 || Str[0] != 'w')
    return false;

  for (unsigned i = 1; i < N; ++i) {
    // isdigit() requires an unsigned char value; a plain char may be negative.
    if (!isdigit(static_cast<unsigned char>(Str[i])))
      return false;
  }
  return true;
}
141  unsigned Length = BufferPos - Result.start;
142  switch (Length) {
143  case 3:
144  if (memcmp("def", Result.start, 3) == 0)
145  return SetTokenKind(Result, Token::KWReserved);
146  if (memcmp("var", Result.start, 3) == 0)
147  return SetTokenKind(Result, Token::KWReserved);
148  break;
149 
150  case 4:
151  if (memcmp("true", Result.start, 4) == 0)
152  return SetTokenKind(Result, Token::KWTrue);
153  break;
154 
155  case 5:
156  if (memcmp("array", Result.start, 5) == 0)
157  return SetTokenKind(Result, Token::KWArray);
158  if (memcmp("false", Result.start, 5) == 0)
159  return SetTokenKind(Result, Token::KWFalse);
160  if (memcmp("query", Result.start, 5) == 0)
161  return SetTokenKind(Result, Token::KWQuery);
162  break;
163 
164  case 6:
165  if (memcmp("define", Result.start, 6) == 0)
166  return SetTokenKind(Result, Token::KWReserved);
167  break;
168 
169  case 7:
170  if (memcmp("declare", Result.start, 7) == 0)
171  return SetTokenKind(Result, Token::KWReserved);
172  break;
173 
174  case 8:
175  if (memcmp("symbolic", Result.start, 8) == 0)
176  return SetTokenKind(Result, Token::KWSymbolic);
177  break;
178  }
179 
180  if (isReservedKW(Result.start, Length))
181  return SetTokenKind(Result, Token::KWReserved);
182  if (isWidthKW(Result.start, Length))
183  return SetTokenKind(Result, Token::KWWidth);
184 
185  return SetTokenKind(Result, Token::Identifier);
186 }
187 
189  for (;;) {
190  int Char = GetNextChar();
191  if (Char == -1 || Char =='\n')
192  break;
193  }
194 }
195 
197  while (isalnum(PeekNextChar()) || PeekNextChar()=='_')
198  GetNextChar();
199  return SetTokenKind(Result, Token::Number);
200 }
201 
204  GetNextChar();
205 
206  // Recognize keywords specially.
207  return SetIdentifierTokenKind(Result);
208 }
209 
210 Token &Lexer::Lex(Token &Result) {
211  Result.kind = Token::Unknown;
212  Result.length = 0;
213  Result.start = BufferPos;
214 
215  // Skip whitespace.
216  while (isspace(PeekNextChar()))
217  GetNextChar();
218 
219  Result.start = BufferPos;
220  Result.line = LineNumber;
221  Result.column = ColumnNumber;
222  int Char = GetNextChar();
223  switch (Char) {
224  case -1: return SetTokenKind(Result, Token::EndOfFile);
225 
226  case '(': return SetTokenKind(Result, Token::LParen);
227  case ')': return SetTokenKind(Result, Token::RParen);
228  case ',': return SetTokenKind(Result, Token::Comma);
229  case ':': return SetTokenKind(Result, Token::Colon);
230  case ';': return SetTokenKind(Result, Token::Semicolon);
231  case '=': return SetTokenKind(Result, Token::Equals);
232  case '@': return SetTokenKind(Result, Token::At);
233  case '[': return SetTokenKind(Result, Token::LSquare);
234  case ']': return SetTokenKind(Result, Token::RSquare);
235  case '{': return SetTokenKind(Result, Token::LBrace);
236  case '}': return SetTokenKind(Result, Token::RBrace);
237 
238  case '#':
239  SkipToEndOfLine();
240  return SetTokenKind(Result, Token::Comment);
241 
242  case '+': {
243  if (isdigit(PeekNextChar()))
244  return LexNumber(Result);
245  else
246  return SetTokenKind(Result, Token::Unknown);
247  }
248 
249  case '-': {
250  int Next = PeekNextChar();
251  if (Next == '>')
252  return GetNextChar(), SetTokenKind(Result, Token::Arrow);
253  else if (isdigit(Next))
254  return LexNumber(Result);
255  else
256  return SetTokenKind(Result, Token::Unknown);
257  break;
258  }
259 
260  default:
261  if (isdigit(Char))
262  return LexNumber(Result);
263  else if (isalpha(Char) || Char == '_')
264  return LexIdentifier(Result);
265  return SetTokenKind(Result, Token::Unknown);
266  }
267 }
unsigned LineNumber
The buffer end position.
Definition: Lexer.h:79
unsigned line
The length of the token.
Definition: Lexer.h:55
static bool isWidthKW(const char *Str, unsigned N)
Definition: Lexer.cpp:132
int PeekNextChar()
Definition: Lexer.cpp:74
Token & SetTokenKind(Token &Result, Token::Kind k)
Definition: Lexer.cpp:103
Token & LexNumber(Token &Result)
LexNumber - Lex a number which does not have a base specifier.
Definition: Lexer.cpp:196
int GetNextChar()
The current column.
Definition: Lexer.cpp:80
unsigned length
The beginning of the token string.
Definition: Lexer.h:54
[+-]?[0-9][a-zA-Z0-9_]+
Definition: Lexer.h:41
fp[0-9]+([.].*)?, i[0-9]+
Definition: Lexer.h:34
static bool isInternalIdentifierChar(int Char)
Definition: Lexer.cpp:62
Token & SetIdentifierTokenKind(Token &Result)
Definition: Lexer.cpp:140
const char * start
The token kind.
Definition: Lexer.h:53
static bool isReservedKW(const char *Str, unsigned N)
Definition: Lexer.cpp:109
unsigned column
The line number of the start of this token.
Definition: Lexer.h:56
Identifier - Wrapper for a uniqued string.
Definition: Parser.h:31
const char * BufferPos
Definition: Lexer.h:77
unsigned ColumnNumber
The current line.
Definition: Lexer.h:80
int line
Definition: klee.h:68
Token & LexIdentifier(Token &Result)
LexIdentifier - Lex an identifier.
Definition: Lexer.cpp:202
const char * BufferEnd
The current lexer position.
Definition: Lexer.h:78
[a-zA-Z_][a-zA-Z0-9._]*
Definition: Lexer.h:30
Token & Lex(Token &Result)
Definition: Lexer.cpp:210
void SkipToEndOfLine()
Definition: Lexer.cpp:188