tokenize.py

#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins

import sys

from cpp import utils

if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set

# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))

# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from. This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index of the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__
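
# Illustrative note (not in the original source): a NAME token covering
# source[10:13] == 'foo' would be Token(NAME, 'foo', 10, 13); with
# utils.DEBUG unset, its str()/repr() is simply Token('foo').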


def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1

        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':  # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':  # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':  # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':  # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':  # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':  # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():  # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':  # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":  # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':  # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':  # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')

    main(sys.argv)
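

# A minimal usage sketch (assumption: this module is importable as
# cpp.tokenize, matching the 'from cpp import utils' import above):
#
#   from cpp.tokenize import GetTokens, SYNTAX
#   ops = [t.name for t in GetTokens('a->b += 1;') if t.token_type == SYNTAX]
#   # ops == ['->', '+=', ';']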