You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

188 lines
4.3 KiB
C

// SPDX-License-Identifier: LGPL-3.0-or-later
/**
* \file utf/rune.c
*
* Implements `char`s to `Rune` conversion functions and back.
*
* \copyright The DoubleFourteen Code Forge (C) All Rights Reserved
* \author Lorenzo Cogotti
*/
/*
* This work is derived from Plan 9 libutf, see utf/utf.h for details.
* Original license terms follow:
*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*
* The original libutf library is available at: https://9fans.github.io/plan9port/unix/libutf.tgz
*/
#include "utf/utf.h"
#define BIT1 7
#define BITX 6
#define BIT2 5
#define BIT3 4
#define BIT4 3
#define BIT5 2
#define T1 (((1u << (BIT1+1)) - 1u) ^ 0xffu) /* 0000 0000 */
#define TX (((1u << (BITX+1)) - 1u) ^ 0xffu) /* 1000 0000 */
#define T2 (((1u << (BIT2+1)) - 1u) ^ 0xffu) /* 1100 0000 */
#define T3 (((1u << (BIT3+1)) - 1u) ^ 0xffu) /* 1110 0000 */
#define T4 (((1u << (BIT4+1)) - 1u) ^ 0xffu) /* 1111 0000 */
#define T5 (((1u << (BIT5+1)) - 1u) ^ 0xffu) /* 1111 1000 */
#define RUNE1 ((U32_C(1) << (BIT1 + 0*BITX)) - 1u) /* 0000 0000 0000 0000 0111 1111 */
#define RUNE2 ((U32_C(1) << (BIT2 + 1*BITX)) - 1u) /* 0000 0000 0000 0111 1111 1111 */
#define RUNE3 ((U32_C(1) << (BIT3 + 2*BITX)) - 1u) /* 0000 0000 1111 1111 1111 1111 */
#define RUNE4 ((U32_C(1) << (BIT4 + 3*BITX)) - 1u) /* 0011 1111 1111 1111 1111 1111 */
#define MASKX ((1u << BITX) - 1u) /* 0011 1111 */
#define TESTX (MASKX ^ 0xFFu) /* 1100 0000 */
size_t chartorune(Rune *dest, const char *str)
{
// One character sequence
// 00000-0007F => T1
Rune c = *(const Uint8 *) str;
if (c < TX) {
*dest = c;
return 1;
}
// Two character sequence
// 0080-07FF => T2 Tx
Rune c1 = *(const Uint8 *) (str+1) ^ TX;
if (c1 & TESTX)
goto bad;
if (c < T3) {
if (c < T2)
goto bad;
Rune r = ((c << BITX) | c1) & RUNE2;
if (r <= RUNE1)
goto bad;
*dest = r;
return 2;
}
// Three character sequence
// 0800-FFFF => T3 Tx Tx
Rune c2 = *(const Uint8 *) (str+2) ^ TX;
if (c2 & TESTX)
goto bad;
if (c < T4) {
Rune r = ((((c << BITX) | c1) << BITX) | c2) & RUNE3;
if (r <= RUNE2)
goto bad;
*dest = r;
return 3;
}
// Four character sequence
// 10000-10FFFF => T4 Tx Tx Tx
Rune c3 = *(const Uint8*) (str+3) ^ TX;
if (c3 & TESTX)
goto bad;
if (c < T5) {
Rune r = ((((((c << BITX) | c1) << BITX) | c2) << BITX) | c3) & RUNE4;
if (r <= RUNE3)
goto bad;
if (r > MAXRUNE)
goto bad;
*dest = r;
return 4;
}
// Decoding error
bad:
*dest = RUNE_ERR;
return 1;
}
size_t runetochar(char *dest, Rune r)
{
// One character sequence
// 00000-0007F => 00-7F
if (r <= RUNE1) {
dest[0] = r;
return 1;
}
// Two character sequence
// 00080-007FF => T2 Tx
if (r <= RUNE2) {
dest[0] = T2 | (r >> 1*BITX);
dest[1] = TX | (r & MASKX);
return 2;
}
// Three character sequence
// 00800-0FFFF => T3 Tx Tx
if (r > MAXRUNE)
r = RUNE_ERR;
if (r <= RUNE3) {
dest[0] = T3 | (r >> 2*BITX);
dest[1] = TX | ((r >> 1*BITX) & MASKX);
dest[2] = TX | (r & MASKX);
return 3;
}
// Four character sequence
// 010000-1FFFFF => T4 Tx Tx Tx
dest[0] = T4 | (r >> 3*BITX);
dest[1] = TX | ((r >> 2*BITX) & MASKX);
dest[2] = TX | ((r >> 1*BITX) & MASKX);
dest[3] = TX | (r & MASKX);
return 4;
}
size_t runelen(Rune r)
{
return 1 + (r > RUNE1) + (r > RUNE2) + (r > RUNE3 && r <= MAXRUNE);
}
size_t runenlen(const Rune *r, size_t n)
{
size_t nb = 0;
while (n--)
nb += runelen(*r++);
return nb;
}
Boolean fullrune(const char *str, size_t n)
{
if (n == 0)
return FALSE;
Rune c = *(const Uint8 *) str;
if (c < TX)
return TRUE;
if (c < T3)
return n >= 2;
if (c < T4)
return n >= 3;
return n >= 4;
}