ICU 62.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if !UCONFIG_NO_NORMALIZATION
30 
31 #include "unicode/stringpiece.h"
32 #include "unicode/uniset.h"
33 #include "unicode/unistr.h"
34 #include "unicode/unorm2.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 class ByteSink;
39 
83 class U_COMMON_API Normalizer2 : public UObject {
84 public:
89  ~Normalizer2();
90 
102  static const Normalizer2 *
103  getNFCInstance(UErrorCode &errorCode);
104 
116  static const Normalizer2 *
117  getNFDInstance(UErrorCode &errorCode);
118 
130  static const Normalizer2 *
131  getNFKCInstance(UErrorCode &errorCode);
132 
144  static const Normalizer2 *
145  getNFKDInstance(UErrorCode &errorCode);
146 
158  static const Normalizer2 *
159  getNFKCCasefoldInstance(UErrorCode &errorCode);
160 
182  static const Normalizer2 *
183  getInstance(const char *packageName,
184  const char *name,
185  UNormalization2Mode mode,
186  UErrorCode &errorCode);
187 
198  UnicodeString
199  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
200  UnicodeString result;
201  normalize(src, result, errorCode);
202  return result;
203  }
217  virtual UnicodeString &
218  normalize(const UnicodeString &src,
219  UnicodeString &dest,
220  UErrorCode &errorCode) const = 0;
221 
246  virtual void
247  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248  Edits *edits, UErrorCode &errorCode) const;
249 
264  virtual UnicodeString &
265  normalizeSecondAndAppend(UnicodeString &first,
266  const UnicodeString &second,
267  UErrorCode &errorCode) const = 0;
282  virtual UnicodeString &
283  append(UnicodeString &first,
284  const UnicodeString &second,
285  UErrorCode &errorCode) const = 0;
286 
300  virtual UBool
301  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302 
327  virtual UBool
328  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329 
345  virtual UChar32
346  composePair(UChar32 a, UChar32 b) const;
347 
356  virtual uint8_t
357  getCombiningClass(UChar32 c) const;
358 
373  virtual UBool
374  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
396  virtual UBool
397  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
398 
399 
415  virtual UNormalizationCheckResult
416  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
417 
440  virtual int32_t
441  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
442 
456  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
457 
472  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
473 
487  virtual UBool isInert(UChar32 c) const = 0;
488 };
489 
501 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
502 public:
513  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
514  norm2(n2), set(filterSet) {}
515 
521 
535  virtual UnicodeString &
536  normalize(const UnicodeString &src,
537  UnicodeString &dest,
538  UErrorCode &errorCode) const U_OVERRIDE;
539 
564  virtual void
565  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
566  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
567 
582  virtual UnicodeString &
583  normalizeSecondAndAppend(UnicodeString &first,
584  const UnicodeString &second,
585  UErrorCode &errorCode) const U_OVERRIDE;
600  virtual UnicodeString &
601  append(UnicodeString &first,
602  const UnicodeString &second,
603  UErrorCode &errorCode) const U_OVERRIDE;
604 
616  virtual UBool
617  getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
618 
630  virtual UBool
631  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
632 
643  virtual UChar32
644  composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
645 
654  virtual uint8_t
655  getCombiningClass(UChar32 c) const U_OVERRIDE;
656 
668  virtual UBool
669  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
691  virtual UBool
692  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
704  virtual UNormalizationCheckResult
705  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
717  virtual int32_t
718  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
719 
728  virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
729 
738  virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
739 
747  virtual UBool isInert(UChar32 c) const U_OVERRIDE;
748 private:
749  UnicodeString &
750  normalize(const UnicodeString &src,
751  UnicodeString &dest,
752  USetSpanCondition spanCondition,
753  UErrorCode &errorCode) const;
754 
755  void
756  normalizeUTF8(uint32_t options, const char *src, int32_t length,
757  ByteSink &sink, Edits *edits,
758  USetSpanCondition spanCondition,
759  UErrorCode &errorCode) const;
760 
761  UnicodeString &
762  normalizeSecondAndAppend(UnicodeString &first,
763  const UnicodeString &second,
764  UBool doNormalize,
765  UErrorCode &errorCode) const;
766 
767  const Normalizer2 &norm2;
768  const UnicodeSet &set;
769 };
770 
771 U_NAMESPACE_END
772 
773 #endif // !UCONFIG_NO_NORMALIZATION
774 #endif // __NORMALIZER2_H__
icu::Normalizer2::getRawDecomposition
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
icu::Normalizer2::getDecomposition
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
icu::Normalizer2::spanQuickCheckYes
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
UNormalization2Mode
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:45
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
icu::UnicodeSet
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:278
UBool
int8_t UBool
The ICU boolean type.
Definition: umachine.h:236
icu::Normalizer2::hasBoundaryAfter
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
U_COMMON_API
#define U_COMMON_API
Definition: utypes.h:359
USetSpanCondition
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:152
icu::FilteredNormalizer2::FilteredNormalizer2
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:513
stringpiece.h
C++ API: StringPiece: Read-only byte string wrapper class.
icu::UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:286
UChar32
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:400
icu::UObject
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
UErrorCode
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition: utypes.h:396
icu::Edits
Records lengths of string edits but not replacement text.
Definition: edits.h:77
icu::Normalizer2::isNormalizedUTF8
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
icu::ByteSink
A ByteSink can be filled with bytes.
Definition: bytestream.h:50
icu::Normalizer2
Unicode normalization functionality for standard Unicode normalization or for using custom mapping ta...
Definition: normalizer2.h:83
icu::Normalizer2::quickCheck
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
icu::Normalizer2::append
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first st...
UNormalizationCheckResult
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:94
unorm2.h
C API: New API for Unicode Normalization.
icu::Normalizer2::isInert
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
icu::Normalizer2::normalizeSecondAndAppend
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) a...
icu::Normalizer2::hasBoundaryBefore
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
icu::Normalizer2::getCombiningClass
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
icu::Normalizer2::isNormalized
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
icu::Normalizer2::composePair
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
uniset.h
C++ API: Unicode Set.
U_NAMESPACE_END
#define U_NAMESPACE_END
Definition: uversion.h:138
icu::Normalizer2::normalizeUTF8
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchange...
U_NAMESPACE_BEGIN
#define U_NAMESPACE_BEGIN
Definition: uversion.h:137
icu::StringPiece
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:54
icu::Normalizer2::normalize
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:199
unistr.h
C++ API: Unicode String.
icu::FilteredNormalizer2
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:501