summaryrefslogtreecommitdiff
path: root/gcc/ada/a-stuten.ads
blob: b8c09a631db4219360633cc39ce36b3aacd8fde4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
------------------------------------------------------------------------------
--                                                                          --
--                         GNAT RUN-TIME COMPONENTS                         --
--                                                                          --
--              A D A . S T R I N G S . U T F _ E N C O D I N G             --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
-- This specification is derived from the Ada Reference Manual for use with --
-- GNAT. The copyright notice above, and the license provisions that follow --
-- apply solely to the  contents of the part following the private keyword. --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
-- for  more details.  You should have  received  a copy of the GNU General --
-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
-- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
-- Boston, MA 02110-1301, USA.                                              --
--                                                                          --
-- As a special exception,  if other files  instantiate  generics from this --
-- unit, or you link  this unit with other files  to produce an executable, --
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
-- covered  by the  GNU  General  Public  License.  This exception does not --
-- however invalidate  any other reasons why  the executable file  might be --
-- covered by the  GNU Public License.                                      --
--                                                                          --
-- GNAT was originally developed  by the GNAT team at  New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
--                                                                          --
------------------------------------------------------------------------------

--  This is one of the Ada 2012 package defined in AI05-0137-1. It is a parent
--  package that contains declarations used in the child packages for handling
--  UTF encoded strings. Note: this package is consistent with Ada 95, and may
--  be used in Ada 95 or Ada 2005 mode.

with Interfaces;
with Unchecked_Conversion;

package Ada.Strings.UTF_Encoding is
   pragma Pure (UTF_Encoding);

   subtype UTF_String is String;
   --  Used to represent a string of 8-bit values containing a sequence of
   --  values encoded in one of three ways (UTF-8, UTF-16BE, or UTF-16LE).
   --  Typically used in connection with a Scheme parameter indicating which
   --  of the encodings applies. This is not strictly a String value in the
   --  sense defined in the Ada RM, but in practice type String accommodates
   --  all possible 256 codes, and can be used to hold any sequence of 8-bit
   --  codes. We use String directly rather than create a new type so that
   --  all existing facilities for manipulating type String (e.g. the child
   --  packages of Ada.Strings) are available for manipulation of UTF_Strings.

   type Encoding_Scheme is (UTF_8, UTF_16BE, UTF_16LE);
   --  Used to specify which of three possible encodings apply to a UTF_String

   subtype UTF_8_String is String;
   --  Similar to UTF_String but specifically represents a UTF-8 encoded string

   subtype UTF_16_Wide_String is Wide_String;
   --  This is similar to UTF_8_String but is used to represent a Wide_String
   --  value which is a sequence of 16-bit values encoded using UTF-16. Again
   --  this is not strictly a Wide_String in the sense of the Ada RM, but the
   --  type Wide_String can be used to represent a sequence of arbitrary 16-bit
   --  values, and it is more convenient to use Wide_String than a new type.

   Encoding_Error : exception;
   --  This exception is raised in the following situations:
   --    a) A UTF encoded string contains an invalid encoding sequence
   --    b) A UTF-16BE or UTF-16LE input string has an odd length
   --    c) An incorrect character value is present in the Input string
   --    d) The result for a Wide_Character output exceeds 16#FFFF#
   --  The exception message has the index value where the error occurred.

   --  The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
   --  a string to indicate the encoding. The convention in this package is
   --  that on input a correct BOM is ignored and an incorrect BOM causes an
   --  Encoding_Error exception. On output, the output string may or may not
   --  include a BOM depending on the setting of Output_BOM.

   BOM_8    : constant UTF_8_String :=
                Character'Val (16#EF#) &
                Character'Val (16#BB#) &
                Character'Val (16#BF#);

   BOM_16BE : constant UTF_String :=
                Character'Val (16#FE#) &
                Character'Val (16#FF#);

   BOM_16LE : constant UTF_String :=
                Character'Val (16#FF#) &
                Character'Val (16#FE#);

   BOM_16   : constant UTF_16_Wide_String :=
                (1 => Wide_Character'Val (16#FEFF#));

   function Encoding
     (Item    : UTF_String;
      Default : Encoding_Scheme := UTF_8) return Encoding_Scheme;
   --  This function inspects a UTF_String value to determine whether it
   --  starts with a BOM for UTF-8, UTF-16BE, or UTF_16LE. If so, the result
   --  is the scheme corresponding to the BOM. If no valid BOM is present
   --  then the result is the specified Default value.

private
   function To_Unsigned_8 is new
     Unchecked_Conversion (Character, Interfaces.Unsigned_8);

   function To_Unsigned_16 is new
     Unchecked_Conversion (Wide_Character, Interfaces.Unsigned_16);

   function To_Unsigned_32 is new
     Unchecked_Conversion (Wide_Wide_Character, Interfaces.Unsigned_32);

   subtype UTF_XE_Encoding is Encoding_Scheme range UTF_16BE .. UTF_16LE;
   --  Subtype containing only UTF_16BE and UTF_16LE entries

   --  Utility routines for converting between UTF-16 and UTF-16LE/BE

   function From_UTF_16
     (Item          : UTF_16_Wide_String;
      Output_Scheme : UTF_XE_Encoding;
      Output_BOM    : Boolean := False) return UTF_String;
   --  The input string Item is encoded in UTF-16. The output is encoded using
   --  Output_Scheme (which is either UTF-16LE or UTF-16BE). There are no error
   --  cases. The output starts with BOM_16BE/LE if Output_BOM is True.

   function To_UTF_16
     (Item          : UTF_String;
      Input_Scheme  : UTF_XE_Encoding;
      Output_BOM    : Boolean := False) return UTF_16_Wide_String;
   --  The input string Item is encoded using Input_Scheme which is either
   --  UTF-16LE or UTF-16BE. The output is the corresponding UTF_16 wide
   --  string. Encoding error is raised if the length of the input is odd.
   --  The output starts with BOM_16 if Output_BOM is True.

   procedure Raise_Encoding_Error (Index : Natural);
   pragma No_Return (Raise_Encoding_Error);
   --  Raise Encoding_Error exception for bad encoding in input item. The
   --  parameter Index is the index of the location in Item for the error.

end Ada.Strings.UTF_Encoding;