unicode.h 2.19 KB
Newer Older
1 2 3 4 5 6
#ifndef UNICODE_H
#define UNICODE_H

#include <stddef.h>
#include "simtypes.h"

7 8 9
// Code point type: large enough to hold every possible Unicode code point.
// NOTE(review): uint32 is presumably declared by simtypes.h - confirm.
typedef uint32 utf32;

10 11 12
// Code unit types for the UTF-8 and UTF-16 encodings.
// utf8 is a single byte of a UTF-8 sequence; one code point may span several of them.
// utf16 is a single UTF-16 code unit.
typedef unsigned char  utf8;
typedef unsigned short utf16;

13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
// Code point produced when the decoder reaches the terminator of a C string
// (see utf8_decoder_t). Declared here, defined elsewhere.
// NOTE(review): presumably the value 0 - confirm against the definition.
extern utf32 const UNICODE_NUL;

/**
 * UTF-8 string decoder that iterates over the code points of a C string.
 *
 * The static decode() overloads work on a raw byte pointer and keep no state;
 * has_next() / next() / get_position() operate on the string given to the constructor.
 */
class utf8_decoder_t
{
private:
	// Pointer to the UTF-8 formatted C string.
	// NOTE(review): presumably advanced as code points are consumed by next() - confirm.
	utf8 const *utf8str;
public:
	// Constructs a UTF-8 decoder for the given C string.
	utf8_decoder_t(utf8 const *str);

	/**
	 * Decodes one Unicode code point from the byte sequence pointed to by buff.
	 * On return buff has been advanced to the beginning of the next code point.
	 * Does not stop at the NUL terminator: when decoding C strings the caller
	 * must detect the emitted UNICODE_NUL itself to avoid buffer overrun errors.
	 * Invalid sequences are interpreted as ISO-8859-1 and advance buff by 1 byte.
	 */
	static utf32 decode(utf8 const *&buff);

	/**
	 * Decodes one Unicode code point from the byte sequence pointed to by buff.
	 * buff itself is not modified; on return len contains the length of the
	 * decoded character in bytes.
	 * Does not stop at the NUL terminator: when decoding C strings the caller
	 * must detect the emitted UNICODE_NUL itself to avoid buffer overrun errors.
	 * Invalid sequences are interpreted as ISO-8859-1 with a len of 1.
	 */
	static utf32 decode(utf8 const *const buff, size_t &len);

	/**
	 * Returns true if there are more code points left to decode.
	 * Returns false if at the end of the string.
	 */
	bool has_next() const;

	/**
	 * Returns the next Unicode code point value in the string.
	 * Returns UNICODE_NUL if has_next() returns false.
	 */
	utf32 next();

	/**
	 * Returns the current position of the decoder.
	 * This is a pointer to the next character to be decoded.
	 * NOTE(review): not declared const although it reads like a pure accessor;
	 * confirm against the definition before changing the signature.
	 */
	utf8 const *get_position();
};

62 63 64 65 66 67 68 69 70 71 72
// Character-wise navigation in the UTF-8 string text, relative to position pos.
// NOTE(review): presumably pos is a byte offset and the return value is the offset
// of the following / preceding character start - confirm against the implementation.
// NOTE(review): the forward variant uses size_t while the backward variant uses
// sint32; callers mixing the two need to watch for sign conversions.
size_t utf8_get_next_char(const utf8 *text, size_t pos);
sint32 utf8_get_prev_char(const utf8 *text, sint32 pos);

// Converts a single UTF-16 value to UTF-8, with out as the destination buffer.
// NOTE(review): presumably returns the number of bytes written; the capacity
// required of out is not visible from this header - confirm against the implementation.
int	utf16_to_utf8(utf16 unicode, utf8 *out);

// Single-character conversion between Unicode (utf16) and latin2.
// unicode_to_latin2 returns the latin2 character, or 0 on error.
// NOTE(review): "latin2" presumably means ISO-8859-2 - confirm against the mapping table.
uint8 unicode_to_latin2( utf16 chr );
utf16 latin2_to_unicode( uint8 chr );


#endif