[thirdparty/gcc.git] / gcc / rust / lex / rust-lex.cc

// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

#include "rust-system.h"
#include "rust-lex.h"
#include "rust-diagnostics.h"
#include "rust-linemap.h"
#include "rust-session-manager.h"
#include "safe-ctype.h"

namespace Rust {
// TODO: move to separate compilation unit?
// overload += for uint32_t to allow 32-bit encoded utf-8 to be added
std::string &
operator+= (std::string &str, Codepoint char32)
{
  if (char32.value < 0x80)
    {
      str += static_cast<char> (char32.value);
    }
  else if (char32.value < (0x1F + 1) << (1 * 6))
    {
      str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else if (char32.value < (0x0F + 1) << (2 * 6))
    {
      str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
      str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else if (char32.value < (0x07 + 1) << (3 * 6))
    {
      str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
      str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else
    {
      rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
    }
  return str;
}

std::string
Codepoint::as_string ()
{
  std::string str;

  // str += Codepoint (value);
  str += *this;

  return str;
}

/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
 * for handling. */
bool
is_float_digit (char number)
{
  return ISDIGIT (number) || number == 'E' || number == 'e';
}

/* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
 * whatever is different */
bool
is_x_digit (char number)
{
  return ISXDIGIT (number);
}

bool
is_octal_digit (char number)
{
  return number >= '0' && number <= '7';
}

bool
is_bin_digit (char number)
{
  return number == '0' || number == '1';
}

bool
check_valid_float_dot_end (char character)
{
  return character != '.' && character != '_' && !ISALPHA (character);
}

// ISSPACE from safe-ctype but may change in future
bool
is_whitespace (char character)
{
  return ISSPACE (character);
}

bool
is_non_decimal_int_literal_separator (char character)
{
  return character == 'x' || character == 'o' || character == 'b';
}

Lexer::Lexer (const std::string &input)
  : input (RAIIFile::create_error ()), current_line (1), current_column (1),
    line_map (nullptr), raw_input_source (new BufferInputSource (input, 0)),
    input_queue{*raw_input_source}, token_queue (TokenSource (this))
{}

Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap)
  : input (std::move (file_input)), current_line (1), current_column (1),
    line_map (linemap),
    raw_input_source (new FileInputSource (input.get_raw ())),
    input_queue{*raw_input_source}, token_queue (TokenSource (this))
{
  // inform line_table that file is being entered and is in line 1
  if (linemap)
    line_map->start_file (filename, current_line);
}

Lexer::~Lexer ()
{
  /* ok apparently stop (which is equivalent of original code in destructor) is
   * meant to be called after all files have finished parsing, for cleanup. On
   * the other hand, actual code that it calls to leave a certain line map is
   * mentioned in GCC docs as being useful for "just leaving an included header"
   * and stuff like that, so this line mapping functionality may need fixing.
   * FIXME: find out whether this occurs. */

  // line_map->stop();
}

/* TODO: need to optimise somehow to avoid the virtual function call in the
 * tight loop. Best idea at the moment is CRTP, but that might make lexer
 * implementation annoying when storing the "base class" (i.e. would need
 * template parameter everywhere), although in practice it would mostly just
 * look ugly and make enclosing classes like Parser also require a type
 * parameter. At this point a macro might be better. OK I guess macros can be
 * replaced by constexpr if or something if possible. */
Location
Lexer::get_current_location ()
{
  if (line_map)
    return line_map->get_location (current_column);
  else
    // If we have no linemap, we're lexing something without proper locations
    return Location ();
}

int
Lexer::peek_input (int n)
{
  return input_queue.peek (n);
}

int
Lexer::peek_input ()
{
  return peek_input (0);
}

void
Lexer::skip_input (int n)
{
  input_queue.skip (n);
}

void
Lexer::skip_input ()
{
  skip_input (0);
}

void
Lexer::replace_current_token (TokenPtr replacement)
{
  token_queue.replace_current_value (replacement);

  rust_debug ("called 'replace_current_token' - this is deprecated");
}

/* shitty anonymous namespace that can only be accessed inside the compilation
 * unit - used for classify_keyword binary search in sorted array of keywords
 * created with x-macros. */
namespace {
// TODO: make constexpr when update to c++20
const std::string keyword_index[] = {
#define RS_TOKEN(x, y)
#define RS_TOKEN_KEYWORD(name, keyword) keyword,
  RS_TOKEN_LIST
#undef RS_TOKEN_KEYWORD
#undef RS_TOKEN
};

constexpr TokenId keyword_keys[] = {
#define RS_TOKEN(x, y)
#define RS_TOKEN_KEYWORD(name, keyword) name,
  RS_TOKEN_LIST
#undef RS_TOKEN_KEYWORD
#undef RS_TOKEN
};

constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
} // namespace

/* Determines whether the string passed in is a keyword or not. If it is, it
 * returns the keyword name.  */
TokenId
Lexer::classify_keyword (const std::string &str)
{
  const std::string *last = keyword_index + num_keywords;
  const std::string *idx = std::lower_bound (keyword_index, last, str);

  if (idx == last || str != *idx)
    return IDENTIFIER;

  // TODO: possibly replace this x-macro system with something like hash map?

  // We now have the expected token ID of the reserved keyword. However, some
  // keywords are reserved starting in certain editions. For example, `try` is
  // only a reserved keyword in editions >=2018. The language might gain new
  // reserved keywords in the future.
  //
  // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
  auto id = keyword_keys[idx - keyword_index];

  // `try` is not a reserved keyword before 2018
  if (Session::get_instance ().options.get_edition ()
	== CompileOptions::Edition::E2015
      && id == TRY)
    return IDENTIFIER;

  return id;
}

TokenPtr
Lexer::build_token ()
{
  // loop to go through multiple characters to build a single token
  while (true)
    {
      Location loc = get_current_location ();
      current_char = peek_input ();
      skip_input ();

      // detect UTF8 bom
      //
      // Must be the first thing on the first line.
      // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
      // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
      if (current_line == 1 && current_column == 1 && current_char == 0xef
	  && peek_input () == 0xbb && peek_input (1) == 0xbf)
	{
	  skip_input (1);
	  current_char = peek_input ();
	  skip_input ();
	}

      // detect shebang
      // Must be the first thing on the first line, starting with #!
      // But since an attribute can also start with an #! we don't count it as a
      // shebang line when after any whitespace or comments there is a [. If it
      // is a shebang line we simple drop the line. Otherwise we don't consume
      // any characters and fall through to the real tokenizer.
      if (current_line == 1 && current_column == 1 && current_char == '#'
	  && peek_input () == '!')
	{
	  int n = 1;
	  while (true)
	    {
	      int next_char = peek_input (n);
	      if (is_whitespace (next_char))
		n++;
	      else if ((next_char == '/' && peek_input (n + 1) == '/'
			&& peek_input (n + 2) != '!'
			&& peek_input (n + 2) != '/')
		       || (next_char == '/' && peek_input (n + 1) == '/'
			   && peek_input (n + 2) == '/'
			   && peek_input (n + 3) == '/'))
		{
		  // two // or four ////
		  // A single line comment
		  // (but not an inner or outer doc comment)
		  n += 2;
		  next_char = peek_input (n);
		  while (next_char != '\n' && next_char != EOF)
		    {
		      n++;
		      next_char = peek_input (n);
		    }
		  if (next_char == '\n')
		    n++;
		}
	      else if (next_char == '/' && peek_input (n + 1) == '*'
		       && peek_input (n + 2) == '*'
		       && peek_input (n + 3) == '/')
		{
		  /**/
		  n += 4;
		}
	      else if (next_char == '/' && peek_input (n + 1) == '*'
		       && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
		       && peek_input (n + 4) == '/')
		{
		  /***/
		  n += 5;
		}
	      else if ((next_char == '/' && peek_input (n + 1) == '*'
			&& peek_input (n + 2) != '*'
			&& peek_input (n + 2) != '!')
		       || (next_char == '/' && peek_input (n + 1) == '*'
			   && peek_input (n + 2) == '*'
			   && peek_input (n + 3) == '*'))
		{
		  // one /* or three /***
		  // Start of a block comment
		  // (but not an inner or outer doc comment)
		  n += 2;
		  int level = 1;
		  while (level > 0)
		    {
		      if (peek_input (n) == EOF)
			break;
		      else if (peek_input (n) == '/'
			       && peek_input (n + 1) == '*')
			{
			  n += 2;
			  level += 1;
			}
		      else if (peek_input (n) == '*'
			       && peek_input (n + 1) == '/')
			{
			  n += 2;
			  level -= 1;
			}
		      else
			n++;
		    }
		}
	      else if (next_char != '[')
		{
		  // definitely shebang, ignore the first line
		  while (current_char != '\n' && current_char != EOF)
		    {
		      current_char = peek_input ();
		      skip_input ();
		    }

		  // newline
		  current_line++;
		  current_column = 1;
		  // tell line_table that new line starts
		  start_line (current_line, max_column_hint);
		  break;
		}
	      else
		break; /* Definitely not a shebang line. */
	    }
	}

      // return end of file token if end of file
      if (current_char == EOF)
	return Token::make (END_OF_FILE, loc);

      // if not end of file, start tokenising
      switch (current_char)
	{
	/* ignore whitespace characters for tokens but continue updating
	 * location */
	case '\n': // newline
	  current_line++;
	  current_column = 1;
	  // tell line_table that new line starts
	  start_line (current_line, max_column_hint);
	  continue;
	case '\r': // cr
	  // Ignore, we expect a newline (lf) soon.
	  continue;
	case ' ': // space
	  current_column++;
	  continue;
	case '\t': // tab
	  // width of a tab is not well-defined, assume 8 spaces
	  current_column += 8;
	  continue;

	// punctuation - actual tokens
	case '=':
	  if (peek_input () == '>')
	    {
	      // match arm arrow
	      skip_input ();
	      current_column += 2;

	      return Token::make (MATCH_ARROW, loc);
	    }
	  else if (peek_input () == '=')
	    {
	      // equality operator
	      skip_input ();
	      current_column += 2;

	      return Token::make (EQUAL_EQUAL, loc);
	    }
	  else
	    {
	      // assignment operator
	      current_column++;
	      return Token::make (EQUAL, loc);
	    }
	case '(':
	  current_column++;
	  return Token::make (LEFT_PAREN, loc);
	case '-':
	  if (peek_input () == '>')
	    {
	      // return type specifier
	      skip_input ();
	      current_column += 2;

	      return Token::make (RETURN_TYPE, loc);
	    }
	  else if (peek_input () == '=')
	    {
	      // minus-assign
	      skip_input ();
	      current_column += 2;

	      return Token::make (MINUS_EQ, loc);
	    }
	  else
	    {
	      // minus
	      current_column++;
	      return Token::make (MINUS, loc);
	    }
	case '+':
	  if (peek_input () == '=')
	    {
	      // add-assign
	      skip_input ();
	      current_column += 2;

	      return Token::make (PLUS_EQ, loc);
	    }
	  else
	    {
	      // add
	      current_column++;
	      return Token::make (PLUS, loc);
	    }
	case ')':
	  current_column++;
	  return Token::make (RIGHT_PAREN, loc);
	case ';':
	  current_column++;
	  return Token::make (SEMICOLON, loc);
	case '*':
	  if (peek_input () == '=')
	    {
	      // multiplication-assign
	      skip_input ();
	      current_column += 2;

	      return Token::make (ASTERISK_EQ, loc);
	    }
	  else
	    {
	      // multiplication
	      current_column++;
	      return Token::make (ASTERISK, loc);
	    }
	case ',':
	  current_column++;
	  return Token::make (COMMA, loc);
	case '/':
	  if (peek_input () == '=')
	    {
	      // division-assign
	      skip_input ();
	      current_column += 2;

	      return Token::make (DIV_EQ, loc);
	    }
	  else if ((peek_input () == '/' && peek_input (1) != '!'
		    && peek_input (1) != '/')
		   || (peek_input () == '/' && peek_input (1) == '/'
		       && peek_input (2) == '/'))
	    {
	      // two // or four ////
	      // single line comment
	      // (but not an inner or outer doc comment)
	      skip_input ();
	      current_column += 2;
	      current_char = peek_input ();

	      // basically ignore until line finishes
	      while (current_char != '\n' && current_char != EOF)
		{
		  skip_input ();
		  current_column++; // not used
		  current_char = peek_input ();
		}
	      continue;
	    }
	  else if (peek_input () == '/'
		   && (peek_input (1) == '!' || peek_input (1) == '/'))
	    {
	      /* single line doc comment, inner or outer.  */
	      bool is_inner = peek_input (1) == '!';
	      skip_input (1);
	      current_column += 3;

	      std::string str;
	      str.reserve (32);
	      current_char = peek_input ();
	      while (current_char != '\n')
		{
		  skip_input ();
		  if (current_char == '\r')
		    {
		      char next_char = peek_input ();
		      if (next_char == '\n')
			{
			  current_char = '\n';
			  break;
			}
		      rust_error_at (
			loc, "Isolated CR %<\\r%> not allowed in doc comment");
		      current_char = next_char;
		      continue;
		    }
		  if (current_char == EOF)
		    {
		      rust_error_at (
			loc, "unexpected EOF while looking for end of comment");
		      break;
		    }
		  str += current_char;
		  current_char = peek_input ();
		}
	      skip_input ();
	      current_line++;
	      current_column = 1;
	      // tell line_table that new line starts
	      start_line (current_line, max_column_hint);

	      str.shrink_to_fit ();
	      if (is_inner)
		return Token::make_inner_doc_comment (loc, std::move (str));
	      else
		return Token::make_outer_doc_comment (loc, std::move (str));
	    }
	  else if (peek_input () == '*' && peek_input (1) == '*'
		   && peek_input (2) == '/')
	    {
	      /**/
	      skip_input (2);
	      current_column += 4;
	      continue;
	    }
	  else if (peek_input () == '*' && peek_input (1) == '*'
		   && peek_input (2) == '*' && peek_input (3) == '/')
	    {
	      /***/
	      skip_input (3);
	      current_column += 5;
	      continue;
	    }
	  else if ((peek_input () == '*' && peek_input (1) != '!'
		    && peek_input (1) != '*')
		   || (peek_input () == '*' && peek_input (1) == '*'
		       && peek_input (2) == '*'))
	    {
	      // one /* or three /***
	      // block comment
	      // (but not an inner or outer doc comment)
	      skip_input ();
	      current_column += 2;

	      int level = 1;
	      while (level > 0)
		{
		  current_char = peek_input ();

		  if (current_char == EOF)
		    {
		      rust_error_at (
			loc, "unexpected EOF while looking for end of comment");
		      break;
		    }

		  // if /* found
		  if (current_char == '/' && peek_input (1) == '*')
		    {
		      // skip /* characters
		      skip_input (1);

		      current_column += 2;

		      level += 1;
		      continue;
		    }

		  // ignore until */ is found
		  if (current_char == '*' && peek_input (1) == '/')
		    {
		      // skip */ characters
		      skip_input (1);

		      current_column += 2;

		      level -= 1;
		      continue;
		    }

		  if (current_char == '\n')
		    {
		      skip_input ();
		      current_line++;
		      current_column = 1;
		      // tell line_table that new line starts
		      start_line (current_line, max_column_hint);
		      continue;
		    }

		  skip_input ();
		  current_column++;
		}

	      // refresh new token
	      continue;
	    }
	  else if (peek_input () == '*'
		   && (peek_input (1) == '!' || peek_input (1) == '*'))
	    {
	      // block doc comment, inner /*! or outer /**
	      bool is_inner = peek_input (1) == '!';
	      skip_input (1);
	      current_column += 3;

	      std::string str;
	      str.reserve (96);

	      int level = 1;
	      while (level > 0)
		{
		  current_char = peek_input ();

		  if (current_char == EOF)
		    {
		      rust_error_at (
			loc, "unexpected EOF while looking for end of comment");
		      break;
		    }

		  // if /* found
		  if (current_char == '/' && peek_input (1) == '*')
		    {
		      // skip /* characters
		      skip_input (1);
		      current_column += 2;

		      level += 1;
		      str += "/*";
		      continue;
		    }

		  // ignore until */ is found
		  if (current_char == '*' && peek_input (1) == '/')
		    {
		      // skip */ characters
		      skip_input (1);
		      current_column += 2;

		      level -= 1;
		      if (level > 0)
			str += "*/";
		      continue;
		    }

		  if (current_char == '\r' && peek_input (1) != '\n')
		    rust_error_at (
		      loc, "Isolated CR %<\\r%> not allowed in doc comment");

		  if (current_char == '\n')
		    {
		      skip_input ();
		      current_line++;
		      current_column = 1;
		      // tell line_table that new line starts
		      start_line (current_line, max_column_hint);
		      str += '\n';
		      continue;
		    }

		  str += current_char;
		  skip_input ();
		  current_column++;
		}

	      str.shrink_to_fit ();
	      if (is_inner)
		return Token::make_inner_doc_comment (loc, std::move (str));
	      else
		return Token::make_outer_doc_comment (loc, std::move (str));
	    }
	  else
	    {
	      // division
	      current_column++;
	      return Token::make (DIV, loc);
	    }
	case '%':
	  if (peek_input () == '=')
	    {
	      // modulo-assign
	      skip_input ();
	      current_column += 2;

	      return Token::make (PERCENT_EQ, loc);
	    }
	  else
	    {
	      // modulo
	      current_column++;
	      return Token::make (PERCENT, loc);
	    }
	case '^':
	  if (peek_input () == '=')
	    {
	      // xor-assign?
	      skip_input ();
	      current_column += 2;

	      return Token::make (CARET_EQ, loc);
	    }
	  else
	    {
	      // xor?
	      current_column++;
	      return Token::make (CARET, loc);
	    }
	case '<':
	  if (peek_input () == '<')
	    {
	      if (peek_input (1) == '=')
		{
		  // left-shift assign
		  skip_input (1);
		  current_column += 3;

		  return Token::make (LEFT_SHIFT_EQ, loc);
		}
	      else
		{
		  // left-shift
		  skip_input ();
		  current_column += 2;

		  return Token::make (LEFT_SHIFT, loc);
		}
	    }
	  else if (peek_input () == '=')
	    {
	      // smaller than or equal to
	      skip_input ();
	      current_column += 2;

	      return Token::make (LESS_OR_EQUAL, loc);
	    }
	  else
	    {
	      // smaller than
	      current_column++;
	      return Token::make (LEFT_ANGLE, loc);
	    }
	  break;
	case '>':
	  if (peek_input () == '>')
	    {
	      if (peek_input (1) == '=')
		{
		  // right-shift-assign
		  skip_input (1);
		  current_column += 3;

		  return Token::make (RIGHT_SHIFT_EQ, loc);
		}
	      else
		{
		  // right-shift
		  skip_input ();
		  current_column += 2;

		  return Token::make (RIGHT_SHIFT, loc);
		}
	    }
	  else if (peek_input () == '=')
	    {
	      // larger than or equal to
	      skip_input ();
	      current_column += 2;

	      return Token::make (GREATER_OR_EQUAL, loc);
	    }
	  else
	    {
	      // larger than
	      current_column++;
	      return Token::make (RIGHT_ANGLE, loc);
	    }
	case ':':
	  if (peek_input () == ':')
	    {
	      // scope resolution ::
	      skip_input ();
	      current_column += 2;

	      return Token::make (SCOPE_RESOLUTION, loc);
	    }
	  else
	    {
	      // single colon :
	      current_column++;
	      return Token::make (COLON, loc);
	    }
	case '!':
	  // no special handling for macros in lexer?
	  if (peek_input () == '=')
	    {
	      // not equal boolean operator
	      skip_input ();
	      current_column += 2;

	      return Token::make (NOT_EQUAL, loc);
	    }
	  else
	    {
	      // not equal unary operator
	      current_column++;

	      return Token::make (EXCLAM, loc);
	    }
	case '?':
	  current_column++;
	  return Token::make (QUESTION_MARK, loc);
	case '#':
	  current_column++;
	  return Token::make (HASH, loc);
	case '[':
	  current_column++;
	  return Token::make (LEFT_SQUARE, loc);
	case ']':
	  current_column++;
	  return Token::make (RIGHT_SQUARE, loc);
	case '{':
	  current_column++;
	  return Token::make (LEFT_CURLY, loc);
	case '}':
	  current_column++;
	  return Token::make (RIGHT_CURLY, loc);
	case '@':
	  current_column++;
	  return Token::make (PATTERN_BIND, loc);
	case '$':
	  current_column++;
	  return Token::make (DOLLAR_SIGN, loc);
	case '~':
	  current_column++;
	  return Token::make (TILDE, loc);
	case '\\':
	  current_column++;
	  return Token::make (BACKSLASH, loc);
	case '`':
	  current_column++;
	  return Token::make (BACKTICK, loc);
	case '|':
	  if (peek_input () == '=')
	    {
	      // bitwise or-assign?
	      skip_input ();
	      current_column += 2;

	      return Token::make (PIPE_EQ, loc);
	    }
	  else if (peek_input () == '|')
	    {
	      // logical or
	      skip_input ();
	      current_column += 2;

	      return Token::make (OR, loc);
	    }
	  else
	    {
	      // bitwise or
	      current_column++;

	      return Token::make (PIPE, loc);
	    }
	case '&':
	  if (peek_input () == '=')
	    {
	      // bitwise and-assign?
	      skip_input ();
	      current_column += 2;

	      return Token::make (AMP_EQ, loc);
	    }
	  else if (peek_input () == '&')
	    {
	      // logical and
	      skip_input ();
	      current_column += 2;

	      return Token::make (LOGICAL_AND, loc);
	    }
	  else
	    {
	      // bitwise and/reference
	      current_column++;

	      return Token::make (AMP, loc);
	    }
	case '.':
	  if (peek_input () == '.')
	    {
	      if (peek_input (1) == '.')
		{
		  // ellipsis
		  skip_input (1);
		  current_column += 3;

		  return Token::make (ELLIPSIS, loc);
		}
	      else if (peek_input (1) == '=')
		{
		  // ..=
		  skip_input (1);
		  current_column += 3;

		  return Token::make (DOT_DOT_EQ, loc);
		}
	      else
		{
		  // ..
		  skip_input ();
		  current_column += 2;

		  return Token::make (DOT_DOT, loc);
		}
	    }
	  else /*if (!ISDIGIT (peek_input ()))*/
	    {
	      // single dot .
	      // Only if followed by a non-number - otherwise is float
	      // nope, float cannot start with '.'.
	      current_column++;
	      return Token::make (DOT, loc);
	    }
	}
      // TODO: special handling of _ in the lexer? instead of being identifier

      // byte character, byte string and raw byte string literals
      if (current_char == 'b')
	{
	  if (peek_input () == '\'')
	    return parse_byte_char (loc);
	  else if (peek_input () == '"')
	    return parse_byte_string (loc);
	  else if (peek_input () == 'r'
		   && (peek_input (1) == '#' || peek_input (1) == '"'))
	    return parse_raw_byte_string (loc);
	}

      // raw identifiers and raw strings
      if (current_char == 'r')
	{
	  int peek = peek_input ();
	  int peek1 = peek_input (1);

	  if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
	    {
	      TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
	      if (raw_ident_ptr != nullptr)
		return raw_ident_ptr;
	      else
		continue; /* input got parsed, it just wasn't valid. An error
			     was produced. */
	    }
	  else
	    {
	      TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
	      if (maybe_raw_string_ptr != nullptr)
		return maybe_raw_string_ptr;
	    }
	}

      // find identifiers and keywords
      if (ISALPHA (current_char) || current_char == '_')
	return parse_identifier_or_keyword (loc);

      // int and float literals
      if (ISDIGIT (current_char))
	{ //  _ not allowed as first char
	  if (current_char == '0'
	      && is_non_decimal_int_literal_separator (peek_input ()))
	    {
	      // handle binary, octal, hex literals
	      TokenPtr non_dec_int_lit_ptr
		= parse_non_decimal_int_literals (loc);
	      if (non_dec_int_lit_ptr != nullptr)
		return non_dec_int_lit_ptr;
	    }
	  else
	    {
	      // handle decimals (integer or float)
	      TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
	      if (decimal_or_float_ptr != nullptr)
		return decimal_or_float_ptr;
	    }
	}

      // string literals
      if (current_char == '"')
	return parse_string (loc);

      // char literals and lifetime names
      if (current_char == '\'')
	{
	  TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
	  if (char_or_lifetime_ptr != nullptr)
	    return char_or_lifetime_ptr;
	}

      // DEBUG: check for specific character problems:
      if (current_char == '0')
	rust_debug ("'0' uncaught before unexpected character");
      else if (current_char == ']')
	rust_debug ("']' uncaught before unexpected character");
      else if (current_char == 0x5d)
	rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
		    "unexpected character");

      // didn't match anything so error
      rust_error_at (loc, "unexpected character %<%x%>", current_char);
      current_column++;
    }
}

// Parses in a type suffix.
std::pair<PrimitiveCoreType, int>
Lexer::parse_in_type_suffix ()
{
  std::string suffix;
  suffix.reserve (5);

  int additional_length_offset = 0;

  // get suffix
  while (ISALPHA (current_char) || ISDIGIT (current_char)
	 || current_char == '_')
    {
      if (current_char == '_')
	{
	  // don't add _ to suffix
	  skip_input ();
	  current_char = peek_input ();

	  additional_length_offset++;

	  continue;
	}

      additional_length_offset++;

      suffix += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  if (suffix.empty ())
    {
      // no type suffix: do nothing but also no error
      return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
    }
  else if (suffix == "f32")
    {
      return std::make_pair (CORETYPE_F32, additional_length_offset);
    }
  else if (suffix == "f64")
    {
      return std::make_pair (CORETYPE_F64, additional_length_offset);
    }
  else if (suffix == "i8")
    {
      return std::make_pair (CORETYPE_I8, additional_length_offset);
    }
  else if (suffix == "i16")
    {
      return std::make_pair (CORETYPE_I16, additional_length_offset);
    }
  else if (suffix == "i32")
    {
      return std::make_pair (CORETYPE_I32, additional_length_offset);
    }
  else if (suffix == "i64")
    {
      return std::make_pair (CORETYPE_I64, additional_length_offset);
    }
  else if (suffix == "i128")
    {
      return std::make_pair (CORETYPE_I128, additional_length_offset);
    }
  else if (suffix == "isize")
    {
      return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
    }
  else if (suffix == "u8")
    {
      return std::make_pair (CORETYPE_U8, additional_length_offset);
    }
  else if (suffix == "u16")
    {
      return std::make_pair (CORETYPE_U16, additional_length_offset);
    }
  else if (suffix == "u32")
    {
      return std::make_pair (CORETYPE_U32, additional_length_offset);
    }
  else if (suffix == "u64")
    {
      return std::make_pair (CORETYPE_U64, additional_length_offset);
    }
  else if (suffix == "u128")
    {
      return std::make_pair (CORETYPE_U128, additional_length_offset);
    }
  else if (suffix == "usize")
    {
      return std::make_pair (CORETYPE_USIZE, additional_length_offset);
    }
  else
    {
      rust_error_at (get_current_location (), "unknown number suffix %qs",
		     suffix.c_str ());

      return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
    }
}

// Parses in the exponent part (if any) of a float literal.
std::pair<std::string, int>
Lexer::parse_in_exponent_part ()
{
  int additional_length_offset = 0;
  std::string str;
  if (current_char == 'E' || current_char == 'e')
    {
      // add exponent to string as strtod works with it
      str += current_char;
      skip_input ();
      current_char = peek_input ();

      additional_length_offset++;

      // special - and + handling
      if (current_char == '-')
	{
	  str += '-';

	  skip_input ();
	  current_char = peek_input ();

	  additional_length_offset++;
	}
      else if (current_char == '+')
	{
	  // don't add + but still skip input
	  skip_input ();
	  current_char = peek_input ();

	  additional_length_offset++;
	}

      // parse another decimal number for exponent
      auto str_length = parse_in_decimal ();
      str += std::get<0> (str_length);
      additional_length_offset += std::get<1> (str_length);
    }
  return std::make_pair (str, additional_length_offset);
}

// Parses a decimal integer.
std::tuple<std::string, int, bool>
Lexer::parse_in_decimal ()
{
  /* A pure decimal contains only digits.  */
  bool pure_decimal = true;
  int additional_length_offset = 0;
  std::string str;
  while (ISDIGIT (current_char) || current_char == '_')
    {
      if (current_char == '_')
	{
	  pure_decimal = false;
	  // don't add _ to number
	  skip_input ();
	  current_char = peek_input ();

	  additional_length_offset++;

	  continue;
	}

      additional_length_offset++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }
  return std::make_tuple (str, additional_length_offset, pure_decimal);
}

/* Parses escapes (and string continues) in "byte" strings and characters. Does
 * not support unicode. */
std::tuple<char, int, bool>
Lexer::parse_escape (char opening_char)
{
  int additional_length_offset = 0;
  char output_char = 0;

  // skip to actual letter
  skip_input ();
  current_char = peek_input ();
  additional_length_offset++;

  switch (current_char)
    {
      case 'x': {
	auto hex_escape_pair = parse_partial_hex_escape ();
	long hexLong = hex_escape_pair.first;
	additional_length_offset += hex_escape_pair.second;

	if (hexLong > 255 || hexLong < 0)
	  rust_error_at (
	    get_current_location (),
	    "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
	    static_cast<unsigned int> (hexLong));
	/* TODO: restore capital for escape output - gcc pretty-printer doesn't
	 * support %X directly */
	char hexChar = static_cast<char> (hexLong);

	output_char = hexChar;
      }
      break;
    case 'n':
      output_char = '\n';
      break;
    case 'r':
      output_char = '\r';
      break;
    case 't':
      output_char = '\t';
      break;
    case '\\':
      output_char = '\\';
      break;
    case '0':
      output_char = '\0';
      break;
    case '\'':
      output_char = '\'';
      break;
    case '"':
      output_char = '"';
      break;
    case 'u':
      rust_error_at (get_current_location (),
		     "cannot have a unicode escape \\u in a byte %s",
		     opening_char == '\'' ? "character" : "string");
      // Try to parse it anyway, just to skip it
      parse_partial_unicode_escape ();
      return std::make_tuple (output_char, additional_length_offset, false);
    case '\r':
    case '\n':
      // string continue
      return std::make_tuple (0, parse_partial_string_continue (), true);
    default:
      rust_error_at (get_current_location (),
		     "unknown escape sequence %<\\%c%>", current_char);
      // returns false if no parsing could be done
      // return false;
      return std::make_tuple (output_char, additional_length_offset, false);
      break;
    }
  // all non-special cases (string continue) should skip their used char
  skip_input ();
  current_char = peek_input ();
  additional_length_offset++;

  // returns true if parsing was successful
  // return true;
  return std::make_tuple (output_char, additional_length_offset, false);
}

/* Parses an escape (or string continue) in a string or character. Supports
 * unicode escapes. */
std::tuple<Codepoint, int, bool>
Lexer::parse_utf8_escape (char opening_char)
{
  Codepoint output_char;
  int additional_length_offset = 0;

  // skip to actual letter
  skip_input ();
  current_char = peek_input ();
  additional_length_offset++;

  switch (current_char)
    {
      case 'x': {
	auto hex_escape_pair = parse_partial_hex_escape ();
	long hexLong = hex_escape_pair.first;
	additional_length_offset += hex_escape_pair.second;

	if (hexLong > 127 || hexLong < 0)
	  rust_error_at (
	    get_current_location (),
	    "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
	    static_cast<unsigned int> (hexLong));
	/* TODO: restore capital for escape output - gcc pretty-printer doesn't
	 * support %X directly */
	char hexChar = static_cast<char> (hexLong);

	output_char = hexChar;
      }
      break;
    case 'n':
      output_char = '\n';
      break;
    case 'r':
      output_char = '\r';
      break;
    case 't':
      output_char = '\t';
      break;
    case '\\':
      output_char = '\\';
      break;
    case '0':
      output_char = '\0';
      break;
    case '\'':
      output_char = '\'';
      break;
    case '"':
      output_char = '"';
      break;
      case 'u': {
	auto unicode_escape_pair = parse_partial_unicode_escape ();
	output_char = unicode_escape_pair.first;
	additional_length_offset += unicode_escape_pair.second;

	return std::make_tuple (output_char, additional_length_offset, false);
      }
      break;
    case '\r':
    case '\n':
      // string continue
      return std::make_tuple (0, parse_partial_string_continue (), true);
    default:
      rust_error_at (get_current_location (),
		     "unknown escape sequence %<\\%c%>", current_char);
      // returns false if no parsing could be done
      // return false;
      return std::make_tuple (output_char, additional_length_offset, false);
      break;
    }
  /* all non-special cases (unicode, string continue) should skip their used
   * char */
  skip_input ();
  current_char = peek_input ();
  additional_length_offset++;

  // returns true if parsing was successful
  // return true;
  return std::make_tuple (output_char, additional_length_offset, false);
}

// Parses the body of a string continue that has been found in an escape.
int
Lexer::parse_partial_string_continue ()
{
  int additional_length_offset = 1;

  // string continue
  while (is_whitespace (current_char))
    {
      if (current_char == '\n')
	{
	  current_line++;
	  current_column = 1;
	  // tell line_table that new line starts
	  start_line (current_line, max_column_hint);

	  // reset "length"
	  additional_length_offset = 1;

	  // get next char
	  skip_input ();
	  current_char = peek_input ();

	  continue;
	}

      skip_input ();
      current_char = peek_input ();
      additional_length_offset++;
    }

  return additional_length_offset;
}

/* Parses the body of a '\x' escape. Note that it does not check that the number
 * is valid and smaller than 255. */
std::pair<long, int>
Lexer::parse_partial_hex_escape ()
{
  // hex char string (null-terminated)
  char hexNum[3] = {0, 0, 0};

  // first hex char
  current_char = peek_input (1);
  int additional_length_offset = 1;

  if (!is_x_digit (current_char))
    {
      rust_error_at (get_current_location (),
		     "invalid character %<\\x%c%> in \\x sequence",
		     current_char);
      return std::make_pair (0, 0);
    }
  hexNum[0] = current_char;

  // second hex char
  skip_input ();
  current_char = peek_input (1);
  additional_length_offset++;

  if (!is_x_digit (current_char))
    {
      rust_error_at (get_current_location (),
		     "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
		     current_char);
      return std::make_pair (0, 1);
    }
  skip_input ();
  hexNum[1] = current_char;

  long hexLong = std::strtol (hexNum, nullptr, 16);

  return std::make_pair (hexLong, additional_length_offset);
}

// Parses the body of a unicode escape.
std::pair<Codepoint, int>
Lexer::parse_partial_unicode_escape ()
{
  skip_input ();
  current_char = peek_input ();
  int additional_length_offset = 0;

  if (current_char != '{')
    {
      rust_error_at (get_current_location (),
		     "unicode escape should start with %<{%>");
      /* Skip what should probaby have been between brackets.  */
      while (is_x_digit (current_char) || current_char == '_')
	{
	  skip_input ();
	  current_char = peek_input ();
	  additional_length_offset++;
	}
      return std::make_pair (Codepoint (0), additional_length_offset);
    }

  skip_input ();
  current_char = peek_input ();
  additional_length_offset++;

  if (current_char == '_')
    {
      rust_error_at (get_current_location (),
		     "unicode escape cannot start with %<_%>");
      skip_input ();
      current_char = peek_input ();
      additional_length_offset++;
      // fallthrough and try to parse the rest anyway
    }

  // parse unicode escape - 1-6 hex digits
  std::string num_str;
  num_str.reserve (6);

  // loop through to add entire hex number to string
  while (is_x_digit (current_char) || current_char == '_')
    {
      if (current_char == '_')
	{
	  // don't add _ to number
	  skip_input ();
	  current_char = peek_input ();

	  additional_length_offset++;

	  continue;
	}

      additional_length_offset++;

      // add raw hex numbers
      num_str += current_char;

      skip_input ();
      current_char = peek_input ();
    }

  if (current_char == '}')
    {
      skip_input ();
      current_char = peek_input ();
      additional_length_offset++;
    }
  else
    {
      // actually an error, but allow propagation anyway Assume that
      // wrong bracketm whitespace or single/double quotes are wrong
      // termination, otherwise it is a wrong character, then skip to the actual
      // terminator.
      if (current_char == '{' || is_whitespace (current_char)
	  || current_char == '\'' || current_char == '"')
	{
	  rust_error_at (get_current_location (),
			 "expected terminating %<}%> in unicode escape");
	  return std::make_pair (Codepoint (0), additional_length_offset);
	}
      else
	{
	  rust_error_at (get_current_location (),
			 "invalid character %<%c%> in unicode escape",
			 current_char);
	  while (current_char != '}' && current_char != '{'
		 && !is_whitespace (current_char) && current_char != '\''
		 && current_char != '"')
	    {
	      skip_input ();
	      current_char = peek_input ();
	      additional_length_offset++;
	    }
	  // Consume the actual closing bracket if found
	  if (current_char == '}')
	    {
	      skip_input ();
	      current_char = peek_input ();
	      additional_length_offset++;
	    }
	  return std::make_pair (Codepoint (0), additional_length_offset);
	}
    }

  // ensure 1-6 hex characters
  if (num_str.length () > 6 || num_str.length () < 1)
    {
      rust_error_at (get_current_location (),
		     "unicode escape should be between 1 and 6 hex "
		     "characters; it is %lu",
		     (unsigned long) num_str.length ());
      // return false;
      return std::make_pair (Codepoint (0), additional_length_offset);
    }

  unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);

  if (hex_num > 0xd7ff && hex_num < 0xe000)
    {
      rust_error_at (
	get_current_location (),
	"unicode escape cannot be a surrogate value (D800 to DFFF)");
      return std::make_pair (Codepoint (0), additional_length_offset);
    }

  if (hex_num > 0x10ffff)
    {
      rust_error_at (get_current_location (),
		     "unicode escape cannot be larger than 10FFFF");
      return std::make_pair (Codepoint (0), additional_length_offset);
    }

  // return true;
  return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
			 additional_length_offset);
}

// Parses a byte character.
TokenPtr
Lexer::parse_byte_char (Location loc)
{
  skip_input ();
  current_column++;
  // make current char the next character
  current_char = peek_input ();

  int length = 1;

  // char to save
  char byte_char = 0;

  // detect escapes
  if (current_char == '\\')
    {
      auto escape_length_pair = parse_escape ('\'');
      byte_char = std::get<0> (escape_length_pair);
      length += std::get<1> (escape_length_pair);

      current_char = peek_input ();

      if (current_char != '\'')
	{
	  rust_error_at (get_current_location (), "unclosed %<byte char%>");
	}

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else if (current_char != '\'')
    {
      // otherwise, get character from direct input character
      byte_char = current_char;

      skip_input ();
      current_char = peek_input ();
      length++;

      if (current_char != '\'')
	{
	  rust_error_at (get_current_location (), "unclosed %<byte char%>");
	}

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else
    {
      rust_error_at (get_current_location (),
		     "no character inside %<%> for %<byte char%>");
    }

  current_column += length;

  return Token::make_byte_char (loc, byte_char);
}

// Parses a byte string.
TokenPtr
Lexer::parse_byte_string (Location loc)
{
  // byte string

  // skip quote character
  skip_input ();
  current_column++;

  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  current_char = peek_input ();

  while (current_char != '"' && current_char != EOF)
    {
      if (current_char == '\\')
	{
	  auto escape_length_pair = parse_escape ('"');
	  char output_char = std::get<0> (escape_length_pair);

	  if (output_char == 0 && std::get<2> (escape_length_pair))
	    length = std::get<1> (escape_length_pair) - 1;
	  else
	    length += std::get<1> (escape_length_pair);

	  if (output_char != 0 || !std::get<2> (escape_length_pair))
	    str += output_char;

	  continue;
	}

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  if (current_char == '"')
    {
      current_column++;

      skip_input ();
      current_char = peek_input ();
    }
  else if (current_char == EOF)
    {
      rust_error_at (get_current_location (), "unended byte string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }
  else
    {
      gcc_unreachable ();
    }

  str.shrink_to_fit ();

  return Token::make_byte_string (loc, std::move (str));
}

// Parses a raw byte string.
TokenPtr
Lexer::parse_raw_byte_string (Location loc)
{
  // raw byte string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  int hash_count = 0;

  // get hash count at beginnning
  skip_input ();
  current_char = peek_input ();
  length++;
  while (current_char == '#')
    {
      hash_count++;
      length++;

      skip_input ();
      current_char = peek_input ();
    }

  if (current_char != '"')
    {
      rust_error_at (get_current_location (),
		     "raw byte string has no opening %<\"%>");
    }

  skip_input ();
  current_char = peek_input ();
  length++;

  while (true)
    {
      if (current_char == '"')
	{
	  bool enough_hashes = true;

	  for (int i = 0; i < hash_count; i++)
	    {
	      if (peek_input (i + 1) != '#')
		{
		  enough_hashes = false;
		  break;
		}
	    }

	  if (enough_hashes)
	    {
	      // skip enough input and peek enough input
	      skip_input (hash_count);
	      current_char = peek_input ();
	      length += hash_count + 1;
	      break;
	    }
	}

      if ((unsigned char) current_char > 127)
	{
	  rust_error_at (get_current_location (),
			 "character %<%c%> in raw byte string out of range",
			 current_char);
	  current_char = 0;
	}

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  str.shrink_to_fit ();

  return Token::make_byte_string (loc, std::move (str));
}

// Parses a raw identifier.
TokenPtr
Lexer::parse_raw_identifier (Location loc)
{
  // raw identifier
  std::string str;
  str.reserve (16); // default

  skip_input ();
  current_char = peek_input ();

  current_column += 2;

  bool first_is_underscore = current_char == '_';

  int length = 0;
  current_char = peek_input ();
  // loop through entire name
  while (ISALPHA (current_char) || ISDIGIT (current_char)
	 || current_char == '_')
    {
      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  // if just a single underscore, not an identifier
  if (first_is_underscore && length == 1)
    rust_error_at (get_current_location (),
		   "%<_%> is not a valid raw identifier");

  if (str == "crate" || str == "extern" || str == "self" || str == "super"
      || str == "Self")
    {
      rust_error_at (get_current_location (),
		     "%qs is a forbidden raw identifier", str.c_str ());

      return nullptr;
    }
  else
    {
      str.shrink_to_fit ();

      return Token::make_identifier (loc, std::move (str));
    }
}

// skip broken string input (unterminated strings)
void
Lexer::skip_broken_string_input (int current_char)
{
  while (current_char != '"' && current_char != EOF)
    {
      if (current_char == '\n')
	{
	  current_line++;
	  current_column = 1;
	}
      else
	{
	  current_column++;
	}
      skip_input ();
      current_char = peek_input ();
    }
  if (current_char == '"')
    {
      current_column++;

      skip_input ();
      current_char = peek_input ();
    }
  rust_debug ("skipped to %d:%d due to bad quotes", current_line,
	      current_column);
}

// Parses a unicode string.
TokenPtr
Lexer::parse_string (Location loc)
{
  Codepoint current_char32;

  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  current_char32 = peek_codepoint_input ();

  // FIXME: This fails if the input ends. How do we check for EOF?
  while (current_char32.value != '"' && !current_char32.is_eof ())
    {
      if (current_char32.value == '\\')
	{
	  // parse escape
	  auto utf8_escape_pair = parse_utf8_escape ('\'');
	  current_char32 = std::get<0> (utf8_escape_pair);

	  if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
	    length = std::get<1> (utf8_escape_pair) - 1;
	  else
	    length += std::get<1> (utf8_escape_pair);

	  if (current_char32 != Codepoint (0)
	      || !std::get<2> (utf8_escape_pair))
	    str += current_char32;

	  // required as parsing utf8 escape only changes current_char
	  current_char32 = peek_codepoint_input ();

	  continue;
	}

      length += get_input_codepoint_length ();

      str += current_char32;
      skip_codepoint_input ();
      current_char32 = peek_codepoint_input ();
    }

  current_column += length;

  if (current_char32.value == '"')
    {
      current_column++;

      skip_input ();
      current_char = peek_input ();
    }
  else if (current_char32.is_eof ())
    {
      rust_error_at (get_current_location (), "unended string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }
  else
    {
      gcc_unreachable ();
    }

  str.shrink_to_fit ();
  return Token::make_string (loc, std::move (str));
}

// Parses an identifier or keyword.
TokenPtr
Lexer::parse_identifier_or_keyword (Location loc)
{
  std::string str;
  str.reserve (16); // default
  str += current_char;

  bool first_is_underscore = current_char == '_';

  int length = 1;
  current_char = peek_input ();
  // loop through entire name
  while (ISALPHA (current_char) || ISDIGIT (current_char)
	 || current_char == '_')
    {
      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  // if just a single underscore, not an identifier
  if (first_is_underscore && length == 1)
    return Token::make (UNDERSCORE, loc);

  str.shrink_to_fit ();

  TokenId keyword = classify_keyword (str);
  if (keyword == IDENTIFIER)
    return Token::make_identifier (loc, std::move (str));
  else
    return Token::make (keyword, loc);
}

// Possibly returns a raw string token if it exists - otherwise returns null.
TokenPtr
Lexer::maybe_parse_raw_string (Location loc)
{
  int peek_index = 0;
  while (peek_input (peek_index) == '#')
    peek_index++;

  if (peek_input (peek_index) == '"')
    return parse_raw_string (loc, peek_index);
  else
    return nullptr;
}

// Returns a raw string token.
TokenPtr
Lexer::parse_raw_string (Location loc, int initial_hash_count)
{
  // raw string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1 + initial_hash_count;

  if (initial_hash_count > 0)
    skip_input (initial_hash_count - 1);

  current_char = peek_input ();

  if (current_char != '"')
    rust_error_at (get_current_location (), "raw string has no opening %<\"%>");

  length++;
  skip_input ();
  Codepoint current_char32 = peek_codepoint_input ();

  while (!current_char32.is_eof ())
    {
      if (current_char32.value == '"')
	{
	  bool enough_hashes = true;

	  for (int i = 0; i < initial_hash_count; i++)
	    {
	      if (peek_input (i + 1) != '#')
		{
		  enough_hashes = false;
		  break;
		}
	    }

	  if (enough_hashes)
	    {
	      // skip enough input and peek enough input
	      skip_input (initial_hash_count);
	      current_char = peek_input ();
	      length += initial_hash_count + 1;
	      break;
	    }
	}

      length++;

      str += current_char32;
      skip_codepoint_input ();
      current_char32 = peek_codepoint_input ();
    }

  current_column += length;

  str.shrink_to_fit ();

  return Token::make_string (loc, std::move (str));
}

template <typename IsDigitFunc>
TokenPtr
Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
				      std::string existent_str, int base)
{
  int length = 1;

  skip_input ();
  current_char = peek_input ();

  length++;

  // loop through to add entire number to string
  while (is_digit_func (current_char) || current_char == '_')
    {
      if (current_char == '_')
	{
	  // don't add _ to number
	  skip_input ();
	  current_char = peek_input ();

	  length++;

	  continue;
	}

      length++;

      // add raw numbers
      existent_str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  // convert value to decimal representation
  long dec_num = std::strtol (existent_str.c_str (), nullptr, base);

  existent_str = std::to_string (dec_num);

  // parse in type suffix if it exists
  auto type_suffix_pair = parse_in_type_suffix ();
  PrimitiveCoreType type_hint = type_suffix_pair.first;
  length += type_suffix_pair.second;

  current_column += length;

  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
    {
      rust_error_at (get_current_location (),
		     "invalid type suffix %qs for integer (%s) literal",
		     get_type_hint_string (type_hint),
		     base == 16
		       ? "hex"
		       : (base == 8 ? "octal"
				    : (base == 2 ? "binary"
						 : "<insert unknown base>")));
      return nullptr;
    }
  return Token::make_int (loc, std::move (existent_str), type_hint);
}

// Parses a hex, binary or octal int literal.
TokenPtr
Lexer::parse_non_decimal_int_literals (Location loc)
{
  std::string str;
  str.reserve (16); // some sensible default
  str += current_char;

  current_char = peek_input ();

  if (current_char == 'x')
    {
      // hex (integer only)
      return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
    }
  else if (current_char == 'o')
    {
      // octal (integer only)
      return parse_non_decimal_int_literal (loc, is_octal_digit,
					    std::move (str), 8);
    }
  else if (current_char == 'b')
    {
      // binary (integer only)
      return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
					    2);
    }
  else
    {
      return nullptr;
    }
}

// Parses a decimal-based int literal or float literal.
TokenPtr
Lexer::parse_decimal_int_or_float (Location loc)
{
  std::string str;
  str.reserve (16); // some sensible default
  str += current_char;

  int length = 1;
  bool first_zero = current_char == '0';

  current_char = peek_input ();

  // parse initial decimal integer (or first integer part of float) literal
  auto initial_decimal = parse_in_decimal ();
  str += std::get<0> (initial_decimal);
  length += std::get<1> (initial_decimal);

  // detect float literal
  if (current_char == '.' && is_float_digit (peek_input (1)))
    {
      // float with a '.', parse another decimal into it

      // add . to str
      str += current_char;
      skip_input ();
      current_char = peek_input ();
      length++;

      // parse another decimal number for float
      auto second_decimal = parse_in_decimal ();
      str += std::get<0> (second_decimal);
      length += std::get<1> (second_decimal);

      // parse in exponent part if it exists
      auto exponent_pair = parse_in_exponent_part ();
      str += exponent_pair.first;
      length += exponent_pair.second;

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      length += type_suffix_pair.second;

      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
	  && type_hint != CORETYPE_UNKNOWN)
	{
	  rust_error_at (get_current_location (),
			 "invalid type suffix %qs for floating-point literal",
			 get_type_hint_string (type_hint));
	  // ignore invalid type suffix as everything else seems fine
	  type_hint = CORETYPE_UNKNOWN;
	}

      current_column += length;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), type_hint);
    }
  else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
    {
      // float that is just an integer with a terminating '.' character

      // add . to str
      str += current_char;
      skip_input ();
      current_char = peek_input ();
      length++;

      // add a '0' after the . to prevent ambiguity
      str += '0';

      // type hint not allowed

      current_column += length;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
    }
  else if (current_char == 'E' || current_char == 'e')
    {
      // exponent float with no '.' character

      // parse exponent part
      auto exponent_pair = parse_in_exponent_part ();
      str += exponent_pair.first;
      length += exponent_pair.second;

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      length += type_suffix_pair.second;

      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
	  && type_hint != CORETYPE_UNKNOWN)
	{
	  rust_error_at (get_current_location (),
			 "invalid type suffix %qs for floating-point literal",
			 get_type_hint_string (type_hint));
	  // ignore invalid type suffix as everything else seems fine
	  type_hint = CORETYPE_UNKNOWN;
	}

      current_column += length;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), type_hint);
    }
  else
    {
      // is an integer

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      /* A "real" pure decimal doesn't have a suffix and no zero prefix.  */
      if (type_hint == CORETYPE_UNKNOWN)
	{
	  bool pure_decimal = std::get<2> (initial_decimal);
	  if (pure_decimal && (!first_zero || str.size () == 1))
	    type_hint = CORETYPE_PURE_DECIMAL;
	}
      length += type_suffix_pair.second;

      current_column += length;

      str.shrink_to_fit ();
      return Token::make_int (loc, std::move (str), type_hint);
    }
}

TokenPtr
Lexer::parse_char_or_lifetime (Location loc)
{
  Codepoint current_char32;

  int length = 1;

  current_char32 = peek_codepoint_input ();
  if (current_char32.is_eof ())
    return nullptr;

  // parse escaped char literal
  if (current_char32.value == '\\')
    {
      // parse escape
      auto utf8_escape_pair = parse_utf8_escape ('\'');
      current_char32 = std::get<0> (utf8_escape_pair);
      length += std::get<1> (utf8_escape_pair);

      if (peek_codepoint_input ().value != '\'')
	{
	  rust_error_at (get_current_location (), "unended character literal");
	}
      else
	{
	  skip_codepoint_input ();
	  current_char = peek_input ();
	  length++;
	}

      current_column += length;

      return Token::make_char (loc, current_char32);
    }
  else
    {
      skip_codepoint_input ();

      if (peek_codepoint_input ().value == '\'')
	{
	  // parse non-escaped char literal

	  // skip the ' character
	  skip_input ();
	  current_char = peek_input ();

	  // TODO fix due to different widths of utf-8 chars?
	  current_column += 3;

	  return Token::make_char (loc, current_char32);
	}
      else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
	       || current_char32.value == '_')
	{
	  // parse lifetime name
	  std::string str;
	  str += current_char32;
	  length++;

	  current_char = peek_input ();
	  while (ISDIGIT (current_char) || ISALPHA (current_char)
		 || current_char == '_')
	    {
	      str += current_char;
	      skip_input ();
	      current_char = peek_input ();
	      length++;
	    }

	  current_column += length;

	  str.shrink_to_fit ();
	  return Token::make_lifetime (loc, std::move (str));
	}
      else
	{
	  rust_error_at (
	    get_current_location (),
	    "expected %' after character constant in character literal");
	  return nullptr;
	}
    }
}

// Returns the length of the codepoint at the current position.
int
Lexer::get_input_codepoint_length ()
{
  uint8_t input = peek_input ();

  if ((int8_t) input == EOF)
    return 0;

  if (input < 128)
    {
      // ascii -- 1 byte
      // return input;

      return 1;
    }
  else if ((input & 0xC0) == 0x80)
    {
      // invalid (continuation; can't be first char)
      // return 0xFFFE;

      return 0;
    }
  else if ((input & 0xE0) == 0xC0)
    {
      // 2 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
      // return output;
      return 2;
    }
  else if ((input & 0xF0) == 0xE0)
    {
      // 3 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      uint8_t input3 = peek_input (2);
      if ((input3 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      /*uint32_t output
	= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
      0); return output;*/
      return 3;
    }
  else if ((input & 0xF8) == 0xF0)
    {
      // 4 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      uint8_t input3 = peek_input (2);
      if ((input3 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      uint8_t input4 = peek_input (3);
      if ((input4 & 0xC0) != 0x80)
	return 0;
      // return 0xFFFE;

      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
      return output;*/
      return 4;
    }
  else
    {
      rust_error_at (get_current_location (),
		     "invalid UTF-8 [FIRST] (too long)");
      return 0;
    }
}

// Returns the codepoint at the current position.
Codepoint
Lexer::peek_codepoint_input ()
{
  uint8_t input = peek_input ();

  if ((int8_t) input == EOF)
    return Codepoint::eof ();

  if (input < 128)
    {
      // ascii -- 1 byte
      return {input};
    }
  else if ((input & 0xC0) == 0x80)
    {
      // invalid (continuation; can't be first char)
      return {0xFFFE};
    }
  else if ((input & 0xE0) == 0xC0)
    {
      // 2 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return {0xFFFE};

      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
      return {output};
    }
  else if ((input & 0xF0) == 0xE0)
    {
      // 3 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return {0xFFFE};

      uint8_t input3 = peek_input (2);
      if ((input3 & 0xC0) != 0x80)
	return {0xFFFE};

      uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
			| ((input3 & 0x3F) << 0);
      return {output};
    }
  else if ((input & 0xF8) == 0xF0)
    {
      // 4 bytes
      uint8_t input2 = peek_input (1);
      if ((input2 & 0xC0) != 0x80)
	return {0xFFFE};

      uint8_t input3 = peek_input (2);
      if ((input3 & 0xC0) != 0x80)
	return {0xFFFE};

      uint8_t input4 = peek_input (3);
      if ((input4 & 0xC0) != 0x80)
	return {0xFFFE};

      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
      return {output};
    }
  else
    {
      rust_error_at (get_current_location (),
		     "invalid UTF-8 [SECND] (too long)");
      return {0xFFFE};
    }
}

void
Lexer::skip_codepoint_input ()
{
  int toSkip = get_input_codepoint_length ();
  gcc_assert (toSkip >= 1);

  skip_input (toSkip - 1);
}

int
Lexer::test_get_input_codepoint_n_length (int n_start_offset)
{
  uint8_t input = peek_input (n_start_offset);

  if (input < 128)
    {
      // ascii -- 1 byte
      // return input;
      return 1;
    }
  else if ((input & 0xC0) == 0x80)
    {
      // invalid (continuation; can't be first char)
      // return 0xFFFE;
      return 0;
    }
  else if ((input & 0xE0) == 0xC0)
    {
      // 2 bytes
      uint8_t input2 = peek_input (n_start_offset + 1);
      if ((input2 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
      // return output;
      return 2;
    }
  else if ((input & 0xF0) == 0xE0)
    {
      // 3 bytes
      uint8_t input2 = peek_input (n_start_offset + 1);
      if ((input2 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      uint8_t input3 = peek_input (n_start_offset + 2);
      if ((input3 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      /*uint32_t output
	= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
      0); return output;*/
      return 3;
    }
  else if ((input & 0xF8) == 0xF0)
    {
      // 4 bytes
      uint8_t input2 = peek_input (n_start_offset + 1);
      if ((input2 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      uint8_t input3 = peek_input (n_start_offset + 2);
      if ((input3 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      uint8_t input4 = peek_input (n_start_offset + 3);
      if ((input4 & 0xC0) != 0x80)
	// return 0xFFFE;
	return 0;

      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
      return output;*/
      return 4;
    }
  else
    {
      rust_error_at (get_current_location (),
		     "invalid UTF-8 [THIRD] (too long)");
      return 0;
    }
}

// peeks the codepoint input at n codepoints ahead of current codepoint - try
// not to use
Codepoint
Lexer::test_peek_codepoint_input (int n)
{
  int totalOffset = 0;

  // add up all offsets into total offset? does this do what I want?
  for (int i = 0; i < n; i++)
    {
      totalOffset += test_get_input_codepoint_n_length (totalOffset);
    }
  // issues: this would have (at least) O(n) lookup time, not O(1) like the
  // rest?

  // TODO: implement if still needed

  // error out of function as it is not implemented
  gcc_assert (1 == 0);
  return {0};
  /*
	  uint8_t input = peek_input();

	  if (input < 128) {
	      // ascii -- 1 byte
	      return input;
	  } else if ((input & 0xC0) == 0x80) {
	      // invalid (continuation; can't be first char)
	      return 0xFFFE;
	  } else if ((input & 0xE0) == 0xC0) {
	      // 2 bytes
	      uint8_t input2 = peek_input(1);
	      if ((input2 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
	      return output;
	  } else if ((input & 0xF0) == 0xE0) {
	      // 3 bytes
	      uint8_t input2 = peek_input(1);
	      if ((input2 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint8_t input3 = peek_input(2);
	      if ((input3 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint32_t output
		= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
     0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
	      // 4 bytes
	      uint8_t input2 = peek_input(1);
	      if ((input2 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint8_t input3 = peek_input(2);
	      if ((input3 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint8_t input4 = peek_input(3);
	      if ((input4 & 0xC0) != 0x80)
		  return 0xFFFE;

	      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
				| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
     0); return output; } else { rust_error_at(get_current_location(), "invalid
     UTF-8 (too long)"); return 0xFFFE;
	  }*/
}

void
Lexer::split_current_token (TokenId new_left, TokenId new_right)
{
  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
   * like "IDENTIFIER"? */
  Location current_loc = peek_token ()->get_locus ();
  TokenPtr new_left_tok = Token::make (new_left, current_loc);
  TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);

  token_queue.replace_current_value (std::move (new_left_tok));
  token_queue.insert (1, std::move (new_right_tok));
}

void
Lexer::start_line (int current_line, int current_column)
{
  if (line_map)
    line_map->start_line (current_line, current_column);
}

} // namespace Rust