[thirdparty/gcc.git] / gcc / ada / s-utf_32.ads

------------------------------------------------------------------------------
--                                                                          --
--                         GNAT RUN-TIME COMPONENTS                         --
--                                                                          --
--                        S Y S T E M . U T F _ 3 2                         --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
--          Copyright (C) 2005-2012, Free Software Foundation, Inc.         --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
--                                                                          --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception,   --
-- version 3.1, as published by the Free Software Foundation.               --
--                                                                          --
-- You should have received a copy of the GNU General Public License and    --
-- a copy of the GCC Runtime Library Exception along with this program;     --
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
-- <http://www.gnu.org/licenses/>.                                          --
--                                                                          --
-- GNAT was originally developed  by the GNAT team at  New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
--                                                                          --
------------------------------------------------------------------------------

--  This package is an internal package that provides basic character
--  classification capabilities needed by the compiler for handling full
--  32-bit wide wide characters. We avoid the use of the actual type
--  Wide_Wide_Character, since we want to use these routines in the compiler
--  itself, and we want to be able to compile the compiler with old versions
--  of GNAT that did not implement Wide_Wide_Character.

--  System.UTF_32 should not be directly used from an application program, but
--  an equivalent package GNAT.UTF_32 can be used directly and provides exactly
--  the same services. The reason this package is in System is so that it can
--  be with'ed by other packages in the Ada and System hierarchies.

pragma Compiler_Unit;

package System.UTF_32 is
   pragma Pure;

   type UTF_32 is range 0 .. 16#7FFF_FFFF#;
   --  So far, the only defined character codes are in 0 .. 16#01_FFFF#
   --
   --  Note (to be confirmed): the Unicode code space extends up to
   --  16#10_FFFF#, so the upper bound quoted above looks like a transposed
   --  digit. Check against the Unicode standard before relying on it.

   --  The following type defines the categories from the Unicode definitions.
   --  The one addition we make is Fe, which represents the characters FFFE
   --  and FFFF in any of the planes.

   type Category is (
     Cc,   --  Other, Control
     Cf,   --  Other, Format
     Cn,   --  Other, Not Assigned
     Co,   --  Other, Private Use
     Cs,   --  Other, Surrogate
     Ll,   --  Letter, Lowercase
     Lm,   --  Letter, Modifier
     Lo,   --  Letter, Other
     Lt,   --  Letter, Titlecase
     Lu,   --  Letter, Uppercase
     Mc,   --  Mark, Spacing Combining
     Me,   --  Mark, Enclosing
     Mn,   --  Mark, Nonspacing
     Nd,   --  Number, Decimal Digit
     Nl,   --  Number, Letter
     No,   --  Number, Other
     Pc,   --  Punctuation, Connector
     Pd,   --  Punctuation, Dash
     Pe,   --  Punctuation, Close
     Pf,   --  Punctuation, Final quote
     Pi,   --  Punctuation, Initial quote
     Po,   --  Punctuation, Other
     Ps,   --  Punctuation, Open
     Sc,   --  Symbol, Currency
     Sk,   --  Symbol, Modifier
     Sm,   --  Symbol, Math
     So,   --  Symbol, Other
     Zl,   --  Separator, Line
     Zp,   --  Separator, Paragraph
     Zs,   --  Separator, Space
     Fe);  --  relative position FFFE/FFFF in any plane

   function Get_Category (U : UTF_32) return Category;
   --  Given a UTF_32 code, returns the corresponding Category, or Cn if
   --  the code does not have an assigned Unicode category.

   --  The following functions perform category tests corresponding to lexical
   --  classes defined in the Ada standard. There are two interfaces for each
   --  function (except Is_UTF_32_Line_Terminator, which has only the first).
   --  The first takes a UTF_32 code. The second takes a Category (e.g. as
   --  returned by Get_Category). The form taking the UTF_32 code is
   --  typically more efficient than calling Get_Category, but if several
   --  different tests are to be performed on the same code, it is more
   --  efficient to use Get_Category to get the category, then test the
   --  resulting category. Each pragma Inline below names an overloaded
   --  function and so applies to both of its forms.

   function Is_UTF_32_Letter (U : UTF_32)   return Boolean;
   function Is_UTF_32_Letter (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Letter);
   --  Returns true iff U is a letter that can be used to start an identifier,
   --  or if C is one of the corresponding categories, which are the following:
   --    Letter, Uppercase (Lu)
   --    Letter, Lowercase (Ll)
   --    Letter, Titlecase (Lt)
   --    Letter, Modifier  (Lm)
   --    Letter, Other     (Lo)
   --    Number, Letter    (Nl)

   function Is_UTF_32_Digit (U : UTF_32)   return Boolean;
   function Is_UTF_32_Digit (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Digit);
   --  Returns true iff U is a digit that can be used to extend an identifier,
   --  or if C is one of the corresponding categories, which are the following:
   --    Number, Decimal Digit (Nd)

   function Is_UTF_32_Line_Terminator (U : UTF_32) return Boolean;
   pragma Inline (Is_UTF_32_Line_Terminator);
   --  Returns true iff U is an allowed line terminator for source programs,
   --  that is, if U is in the category Zp (Separator, Paragraph) or Zl
   --  (Separator, Line), or if U is a conventional line terminator (CR, LF,
   --  VT, FF). There is no category version for this function, since the set
   --  of characters does not correspond to a set of Unicode categories.

   function Is_UTF_32_Mark (U : UTF_32)   return Boolean;
   function Is_UTF_32_Mark (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Mark);
   --  Returns true iff U is a mark character which can be used to extend an
   --  identifier, or if C is one of the corresponding categories, which are
   --  the following:
   --    Mark, Nonspacing (Mn)
   --    Mark, Spacing Combining (Mc)

   function Is_UTF_32_Other (U : UTF_32)   return Boolean;
   function Is_UTF_32_Other (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Other);
   --  Returns true iff U is an other format character, which means that it
   --  can be used to extend an identifier, but is ignored for the purposes of
   --  matching of identifiers, or if C is one of the corresponding categories,
   --  which are the following:
   --    Other, Format (Cf)

   function Is_UTF_32_Punctuation (U : UTF_32)   return Boolean;
   function Is_UTF_32_Punctuation (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Punctuation);
   --  Returns true iff U is a punctuation character that can be used to
   --  separate pieces of an identifier, or if C is one of the corresponding
   --  categories, which are the following:
   --    Punctuation, Connector (Pc)

   function Is_UTF_32_Space (U : UTF_32)   return Boolean;
   function Is_UTF_32_Space (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Space);
   --  Returns true iff U is considered a space to be ignored, or if C is one
   --  of the corresponding categories, which are the following:
   --    Separator, Space (Zs)

   function Is_UTF_32_Non_Graphic (U : UTF_32)   return Boolean;
   function Is_UTF_32_Non_Graphic (C : Category) return Boolean;
   pragma Inline (Is_UTF_32_Non_Graphic);
   --  Returns true iff U is considered to be a non-graphic character, or if C
   --  is one of the corresponding categories, which are the following:
   --    Other, Control (Cc)
   --    Other, Private Use (Co)
   --    Other, Surrogate (Cs)
   --    Separator, Line (Zl)
   --    Separator, Paragraph (Zp)
   --    FFFE or FFFF positions in any plane (Fe)
   --
   --  Note that the Ada category format effector is subsumed by the above
   --  list of Unicode categories.
   --
   --  Note that Other, Not Assigned (Cn) is quite deliberately not included
   --  in the list of categories above. This means that should any of these
   --  code positions be defined in future with graphic characters they will
   --  be allowed without a need to change implementations or the standard.
   --
   --  Note that Other, Format (Cf) is also quite deliberately not included
   --  in the list of categories above. This means that these characters can
   --  be included in character and string literals.

   --  The following functions perform case folding. UTF_32_To_Upper_Case is
   --  the routine used to fold to upper case, as required by the Ada 2005
   --  standard rules for identifier case folding: two identifiers are
   --  equivalent if they are identical after folding all letters to upper
   --  case using this routine. A corresponding routine to fold to lower case
   --  (UTF_32_To_Lower_Case) is also provided.

   function UTF_32_To_Lower_Case (U : UTF_32) return UTF_32;
   pragma Inline (UTF_32_To_Lower_Case);
   --  If U represents an upper case letter, returns the corresponding lower
   --  case letter, otherwise U is returned unchanged. The folding rule is
   --  simply that if the code corresponds to a 10646 entry whose name contains
   --  the string CAPITAL LETTER, and there is a corresponding entry whose name
   --  is the same but with CAPITAL LETTER replaced by SMALL LETTER, then the
   --  code is folded to this SMALL LETTER code. Otherwise the input code is
   --  returned unchanged.

   function UTF_32_To_Upper_Case (U : UTF_32) return UTF_32;
   pragma Inline (UTF_32_To_Upper_Case);
   --  If U represents a lower case letter, returns the corresponding upper
   --  case letter, otherwise U is returned unchanged. The folding rule is
   --  simply that if the code corresponds to a 10646 entry whose name contains
   --  the string SMALL LETTER, and there is a corresponding entry whose name
   --  is the same but with SMALL LETTER replaced by CAPITAL LETTER, then the
   --  code is folded to this CAPITAL LETTER code. Otherwise the input code is
   --  returned unchanged.

end System.UTF_32;
Commit	Line	Data
2754ce81	1	------------------------------------------------------------------------------
	2	-- --
	3	-- GNAT RUN-TIME COMPONENTS --
	4	-- --
	5	-- S Y S T E M . U T F _ 3 2 --
	6	-- --
	7	-- S p e c --
	8	-- --
71e45bc2	9	-- Copyright (C) 2005-2012, Free Software Foundation, Inc. --
2754ce81	10	-- --
	11	-- GNAT is free software; you can redistribute it and/or modify it under --
	12	-- terms of the GNU General Public License as published by the Free Soft- --
6bc9506f	13	-- ware Foundation; either version 3, or (at your option) any later ver- --
2754ce81	14	-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
2754ce81	15	-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
6bc9506f	16	-- or FITNESS FOR A PARTICULAR PURPOSE. --
	17	-- --
	18	-- As a special exception under Section 7 of GPL version 3, you are granted --
	19	-- additional permissions described in the GCC Runtime Library Exception, --
	20	-- version 3.1, as published by the Free Software Foundation. --
	21	-- --
	22	-- You should have received a copy of the GNU General Public License and --
	23	-- a copy of the GCC Runtime Library Exception along with this program; --
	24	-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
	25	-- <http://www.gnu.org/licenses/>. --
2754ce81	26	-- --
	27	-- GNAT was originally developed by the GNAT team at New York University. --
	28	-- Extensive contributions were provided by Ada Core Technologies Inc. --
	29	-- --
	30	------------------------------------------------------------------------------
	31
	32	-- This package is an internal package that provides basic character
	33	-- classification capabilities needed by the compiler for handling full
	34	-- 32-bit wide wide characters. We avoid the use of the actual type
	35	-- Wide_Wide_Character, since we want to use these routines in the compiler
	36	-- itself, and we want to be able to compile the compiler with old versions
	37	-- of GNAT that did not implement Wide_Wide_Character.
	38
	39	-- System.UTF_32 should not be directly used from an application program, but
	40	-- an equivalent package GNAT.UTF_32 can be used directly and provides exactly
	41	-- the same services. The reason this package is in System is so that it can
	42	-- be with'ed by other packages in the Ada and System hierarchies.
	43
2a2c0bda	44	pragma Compiler_Unit;
2a2c0bda	45
2754ce81	46	package System.UTF_32 is
c8e4bf42	47	pragma Pure;
2754ce81	48
	49	type UTF_32 is range 0 .. 16#7FFF_FFFF#;
	50	-- So far, the only defined character codes are in 0 .. 16#01_FFFF#
	51
	52	-- The following type defines the categories from the unicode definitions.
	53	-- The one addition we make is Fe, which represents the characters FFFE
	54	-- and FFFF in any of the planes.
	55
	56	type Category is (
	57	Cc, -- Other, Control
	58	Cf, -- Other, Format
	59	Cn, -- Other, Not Assigned
	60	Co, -- Other, Private Use
	61	Cs, -- Other, Surrogate
	62	Ll, -- Letter, Lowercase
	63	Lm, -- Letter, Modifier
	64	Lo, -- Letter, Other
	65	Lt, -- Letter, Titlecase
	66	Lu, -- Letter, Uppercase
	67	Mc, -- Mark, Spacing Combining
	68	Me, -- Mark, Enclosing
	69	Mn, -- Mark, Nonspacing
	70	Nd, -- Number, Decimal Digit
	71	Nl, -- Number, Letter
	72	No, -- Number, Other
	73	Pc, -- Punctuation, Connector
	74	Pd, -- Punctuation, Dash
	75	Pe, -- Punctuation, Close
	76	Pf, -- Punctuation, Final quote
	77	Pi, -- Punctuation, Initial quote
	78	Po, -- Punctuation, Other
	79	Ps, -- Punctuation, Open
	80	Sc, -- Symbol, Currency
	81	Sk, -- Symbol, Modifier
	82	Sm, -- Symbol, Math
	83	So, -- Symbol, Other
	84	Zl, -- Separator, Line
	85	Zp, -- Separator, Paragraph
	86	Zs, -- Separator, Space
	87	Fe); -- relative position FFFE/FFFF in any plane
	88
	89	function Get_Category (U : UTF_32) return Category;
	90	-- Given a UTF32 code, returns corresponding Category, or Cn if
	91	-- the code does not have an assigned unicode category.
	92
	93	-- The following functions perform category tests corresponding to lexical
	94	-- classes defined in the Ada standard. There are two interfaces for each
	95	-- function. The second takes a Category (e.g. returned by Get_Category).
	96	-- The first takes a UTF_32 code. The form taking the UTF_32 code is
	97	-- typically more efficient than calling Get_Category, but if several
	98	-- different tests are to be performed on the same code, it is more
	99	-- efficient to use Get_Category to get the category, then test the
	100	-- resulting category.
	101
	102	function Is_UTF_32_Letter (U : UTF_32) return Boolean;
	103	function Is_UTF_32_Letter (C : Category) return Boolean;
	104	pragma Inline (Is_UTF_32_Letter);
	105	-- Returns true iff U is a letter that can be used to start an identifier,
	106	-- or if C is one of the corresponding categories, which are the following:
	107	-- Letter, Uppercase (Lu)
	108	-- Letter, Lowercase (Ll)
	109	-- Letter, Titlecase (Lt)
	110	-- Letter, Modifier (Lm)
	111	-- Letter, Other (Lo)
112	-- Number, Letter (Nl)
113
114	function Is_UTF_32_Digit (U : UTF_32) return Boolean;
115	function Is_UTF_32_Digit (C : Category) return Boolean;
116	pragma Inline (Is_UTF_32_Digit);
febb409f	117	-- Returns true iff U is a digit that can be used to extend an identifier,
2754ce81	118	-- or if C is one of the corresponding categories, which are the following:
	119	-- Number, Decimal_Digit (Nd)
	120
	121	function Is_UTF_32_Line_Terminator (U : UTF_32) return Boolean;
	122	pragma Inline (Is_UTF_32_Line_Terminator);
	123	-- Returns true iff U is an allowed line terminator for source programs,
cb395c05	124	-- if U is in the category Zp (Separator, Paragraph), or Zl (Separator,
2754ce81	125	-- Line), or if U is a conventional line terminator (CR, LF, VT, FF).
	126	-- There is no category version for this function, since the set of
	127	-- characters does not correspond to a set of Unicode categories.
	128
	129	function Is_UTF_32_Mark (U : UTF_32) return Boolean;
	130	function Is_UTF_32_Mark (C : Category) return Boolean;
	131	pragma Inline (Is_UTF_32_Mark);
	132	-- Returns true iff U is a mark character which can be used to extend an
	133	-- identifier, or if C is one of the corresponding categories, which are
	134	-- the following:
	135	-- Mark, Non-Spacing (Mn)
	136	-- Mark, Spacing Combining (Mc)
	137
	138	function Is_UTF_32_Other (U : UTF_32) return Boolean;
	139	function Is_UTF_32_Other (C : Category) return Boolean;
	140	pragma Inline (Is_UTF_32_Other);
	141	-- Returns true iff U is an other format character, which means that it
	142	-- can be used to extend an identifier, but is ignored for the purposes of
febb409f	143	-- matching of identifiers, or if C is one of the corresponding categories,
2754ce81	144	-- which are the following:
	145	-- Other, Format (Cf)
	146
	147	function Is_UTF_32_Punctuation (U : UTF_32) return Boolean;
	148	function Is_UTF_32_Punctuation (C : Category) return Boolean;
	149	pragma Inline (Is_UTF_32_Punctuation);
	150	-- Returns true iff U is a punctuation character that can be used to
febb409f	151	-- separate pieces of an identifier, or if C is one of the corresponding
2754ce81	152	-- categories, which are the following:
	153	-- Punctuation, Connector (Pc)
	154
	155	function Is_UTF_32_Space (U : UTF_32) return Boolean;
	156	function Is_UTF_32_Space (C : Category) return Boolean;
	157	pragma Inline (Is_UTF_32_Space);
	158	-- Returns true iff U is considered a space to be ignored, or if C is one
	159	-- of the corresponding categories, which are the following:
	160	-- Separator, Space (Zs)
	161
	162	function Is_UTF_32_Non_Graphic (U : UTF_32) return Boolean;
	163	function Is_UTF_32_Non_Graphic (C : Category) return Boolean;
	164	pragma Inline (Is_UTF_32_Non_Graphic);
	165	-- Returns true iff U is considered to be a non-graphic character, or if C
	166	-- is one of the corresponding categories, which are the following:
	167	-- Other, Control (Cc)
	168	-- Other, Private Use (Co)
	169	-- Other, Surrogate (Cs)
	170	-- Separator, Line (Zl)
	171	-- Separator, Paragraph (Zp)
	172	-- FFFE or FFFF positions in any plane (Fe)
	173	--
	174	-- Note that the Ada category format effector is subsumed by the above
	175	-- list of Unicode categories.
	176	--
febb409f	177	-- Note that Other, Unassigned (Cn) is quite deliberately not included
2754ce81	178	-- in the list of categories above. This means that should any of these
	179	-- code positions be defined in future with graphic characters they will
	180	-- be allowed without a need to change implementations or the standard.
	181	--
	182	-- Note that Other, Format (Cf) is also quite deliberately not included
	183	-- in the list of categories above. This means that these characters can
	184	-- be included in character and string literals.
	185
	186	-- The following function is used to fold to upper case, as required by
	187	-- the Ada 2005 standard rules for identifier case folding. Two
	188	-- identifiers are equivalent if they are identical after folding all
bb0ed4ab	189	-- letters to upper case using this routine. A corresponding routine to
	190	-- fold to lower case is also provided.
	191
	192	function UTF_32_To_Lower_Case (U : UTF_32) return UTF_32;
	193	pragma Inline (UTF_32_To_Lower_Case);
	194	-- If U represents an upper case letter, returns the corresponding lower
	195	-- case letter, otherwise U is returned unchanged. The folding rule is
	196	-- simply that if the code corresponds to a 10646 entry whose name contains
	197	-- the string CAPITAL LETTER, and there is a corresponding entry whose name
	198	-- is the same but with CAPITAL LETTER replaced by SMALL LETTER, then the
	199	-- code is folded to this SMALL LETTER code. Otherwise the input code is
	200	-- returned unchanged.
2754ce81	201
	202	function UTF_32_To_Upper_Case (U : UTF_32) return UTF_32;
	203	pragma Inline (UTF_32_To_Upper_Case);
bb0ed4ab	204	-- If U represents a lower case letter, returns the corresponding upper
	205	-- case letter, otherwise U is returned unchanged. The folding rule is
	206	-- simply that if the code corresponds to a 10646 entry whose name contains
	207	-- the string SMALL LETTER, and there is a corresponding entry whose name
	208	-- is the same but with SMALL LETTER replaced by CAPITAL LETTER, then the
	209	-- code is folded to this CAPITAL LETTER code. Otherwise the input code is
	210	-- returned unchanged.
2754ce81	211
2754ce81	212	end System.UTF_32;