ICU 74.2  74.2
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
91  ~Normalizer2();
92 
104  static const Normalizer2 *
105  getNFCInstance(UErrorCode &errorCode);
106 
118  static const Normalizer2 *
119  getNFDInstance(UErrorCode &errorCode);
120 
132  static const Normalizer2 *
133  getNFKCInstance(UErrorCode &errorCode);
134 
146  static const Normalizer2 *
147  getNFKDInstance(UErrorCode &errorCode);
148 
163  static const Normalizer2 *
164  getNFKCCasefoldInstance(UErrorCode &errorCode);
165 
166 #ifndef U_HIDE_DRAFT_API
167 
181  static const Normalizer2 *
182  getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
183 #endif // U_HIDE_DRAFT_API
184 
206  static const Normalizer2 *
207  getInstance(const char *packageName,
208  const char *name,
209  UNormalization2Mode mode,
210  UErrorCode &errorCode);
211 
222  UnicodeString
223  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
224  UnicodeString result;
225  normalize(src, result, errorCode);
226  return result;
227  }
241  virtual UnicodeString &
242  normalize(const UnicodeString &src,
243  UnicodeString &dest,
244  UErrorCode &errorCode) const = 0;
245 
268  virtual void
269  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
270  Edits *edits, UErrorCode &errorCode) const;
271 
286  virtual UnicodeString &
287  normalizeSecondAndAppend(UnicodeString &first,
288  const UnicodeString &second,
289  UErrorCode &errorCode) const = 0;
304  virtual UnicodeString &
305  append(UnicodeString &first,
306  const UnicodeString &second,
307  UErrorCode &errorCode) const = 0;
308 
322  virtual UBool
323  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
324 
349  virtual UBool
350  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
351 
367  virtual UChar32
368  composePair(UChar32 a, UChar32 b) const;
369 
378  virtual uint8_t
379  getCombiningClass(UChar32 c) const;
380 
395  virtual UBool
396  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
416  virtual UBool
417  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
418 
419 
435  virtual UNormalizationCheckResult
436  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
437 
460  virtual int32_t
461  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
462 
476  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
477 
492  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
493 
507  virtual UBool isInert(UChar32 c) const = 0;
508 };
509 
521 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
522 public:
533  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
534  norm2(n2), set(filterSet) {}
535 
540  ~FilteredNormalizer2();
541 
555  virtual UnicodeString &
556  normalize(const UnicodeString &src,
557  UnicodeString &dest,
558  UErrorCode &errorCode) const override;
559 
582  virtual void
583  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
584  Edits *edits, UErrorCode &errorCode) const override;
585 
600  virtual UnicodeString &
601  normalizeSecondAndAppend(UnicodeString &first,
602  const UnicodeString &second,
603  UErrorCode &errorCode) const override;
618  virtual UnicodeString &
619  append(UnicodeString &first,
620  const UnicodeString &second,
621  UErrorCode &errorCode) const override;
622 
634  virtual UBool
635  getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
636 
648  virtual UBool
649  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
650 
661  virtual UChar32
662  composePair(UChar32 a, UChar32 b) const override;
663 
672  virtual uint8_t
673  getCombiningClass(UChar32 c) const override;
674 
686  virtual UBool
687  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
707  virtual UBool
708  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
720  virtual UNormalizationCheckResult
721  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
733  virtual int32_t
734  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
735 
744  virtual UBool hasBoundaryBefore(UChar32 c) const override;
745 
754  virtual UBool hasBoundaryAfter(UChar32 c) const override;
755 
763  virtual UBool isInert(UChar32 c) const override;
764 private:
765  UnicodeString &
766  normalize(const UnicodeString &src,
767  UnicodeString &dest,
768  USetSpanCondition spanCondition,
769  UErrorCode &errorCode) const;
770 
771  void
772  normalizeUTF8(uint32_t options, const char *src, int32_t length,
773  ByteSink &sink, Edits *edits,
774  USetSpanCondition spanCondition,
775  UErrorCode &errorCode) const;
776 
777  UnicodeString &
778  normalizeSecondAndAppend(UnicodeString &first,
779  const UnicodeString &second,
780  UBool doNormalize,
781  UErrorCode &errorCode) const;
782 
783  const Normalizer2 &norm2;
784  const UnicodeSet &set;
785 };
786 
787 U_NAMESPACE_END
788 
789 #endif // !UCONFIG_NO_NORMALIZATION
790 
791 #endif /* U_SHOW_CPLUSPLUS_API */
792 
793 #endif // __NORMALIZER2_H__
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context...
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:223
Records lengths of string edits but not replacement text.
Definition: edits.h:80
C++ API: StringPiece: Read-only byte string wrapper class.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:435
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one. ...
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:533
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition: uset.h:184
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:295
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:521
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
C++ API: Unicode Set.