tokenize.py

#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins

import sys

from cpp import utils

if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
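# When one of these prefixes is immediately followed by a quote, GetTokens
# below folds the prefix and the literal into a single CONSTANT token
# (e.g. L"wide" or u8"text"), rather than emitting a separate NAME token.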

# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'
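# Roughly: identifiers ('foo') are NAME, literals (42, "str", 'c') are
# CONSTANT, punctuation and operators ('{', '::', '<<') are SYNTAX, and
# directives starting with '#' are PREPROCESSOR.  UNKNOWN is the per-token
# default and only survives for stray characters inside an ignored #if 0
# block.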

# Where the token originated from. This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index just past the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__
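

# For instance, tokenizing the text 'int x;' produces Token('int') with
# start 0 and end 3, so name == source[start:end] for every token.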


def _GetString(source, start, i):
    """Scan a string literal; return the index just past the closing quote."""
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
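
# For example, in the text   x = "a\"b";   the opening quote is at index 4;
# _GetString skips the escaped quote at index 7 and returns 10, one past the
# closing quote, so the resulting CONSTANT token is "a\"b" with both
# surrounding quotes included.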


def _GetChar(source, start, i):
    """Scan a char literal; return the index just past the closing quote."""
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
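
# Similarly, in the text   c = '\n';   the opening single quote is at index 4
# and _GetChar returns 8, so the CONSTANT token is '\n' including both quotes.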


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                    source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # A string prefix (e.g. u8, LR) directly followed by a '"'.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
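            # A directive continued across physical lines with trailing
            # backslashes (e.g. a multi-line #define) is emitted by the loop
            # below as a single PREPROCESSOR token spanning all of its lines.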
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                                condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
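

# For example, GetTokens('int x = 42;\n') yields tokens whose
# (token_type, name) pairs are, in order:
#   (NAME, 'int'), (NAME, 'x'), (SYNTAX, '='), (CONSTANT, '42'), (SYNTAX, ';')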


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')

    main(sys.argv)