commit dfce8a98479c80a98ce8c1017394c7f2d93e4814
parent c88e97c7aac0df9bdb9664c7bb9c1a72cfab277e
Author: robert <robertrussell.72001@gmail.com>
Date: Wed, 10 Aug 2022 18:56:19 -0700
Add function to get unicode category of given rune
Much better than using ICU or some other awful library.
Diffstat:
7 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
*.o
*.a
+gen/*
+tool/ucattab
diff --git a/Makefile b/Makefile
@@ -8,6 +8,7 @@ SRC =\
src/log.c\
src/opt.c\
src/str.c\
+ src/unicode.c\
src/utf8.c
libcext.a: $(SRC:.c=.o)
@@ -21,9 +22,22 @@ src/bench.o: src/bench.c inc/cext/bench.h inc/cext/cext.h inc/cext/def.h inc/cex
src/log.o: src/log.c inc/cext/cext.h inc/cext/def.h inc/cext/log.h config.mk
src/opt.o: src/opt.c inc/cext/cext.h inc/cext/def.h inc/cext/opt.h config.mk
src/str.o: src/str.c inc/cext/alloc.h inc/cext/cext.h inc/cext/def.h inc/cext/log.h inc/cext/str.h config.mk
+src/unicode.o: src/unicode.c inc/cext/cext.h inc/cext/def.h gen/ucattab.inc config.mk
src/utf8.o: src/utf8.c inc/cext/cext.h inc/cext/def.h inc/cext/utf8.h config.mk
+# Generated category tables.  The output is written to a temporary file and
+# renamed only on success, so a failed run never leaves a truncated target
+# that make would consider up to date.
+gen/ucattab.inc: tool/ucattab gen/UnicodeData.txt
+	mkdir -p gen
+	tool/ucattab gen/UnicodeData.txt > $@.tmp
+	mv $@.tmp $@
+
+# The gen directory is created in the recipes instead of being listed as a
+# prerequisite: a directory's timestamp changes whenever a file is created
+# inside it, which would make this download look out of date on every run.
+gen/UnicodeData.txt:
+	mkdir -p gen
+	wget -q -O $@.tmp $(UNICODE_DATA_URL)
+	mv $@.tmp $@
+
+gen:
+	mkdir -p gen
+
+# Build-time helper that turns UnicodeData.txt into gen/ucattab.inc.  It is
+# linked directly against the object files it needs, not against libcext.a.
+tool/ucattab: tool/ucattab.c src/alloc.o src/log.o src/str.o
+ $(CC) -o $@ $(CFLAGS) $^
+
clean:
- rm -f libcext.a $(SRC:.c=.o)
+# Besides the library and objects, remove the generated gen directory and
+# the tool/ucattab helper binary.
+	rm -rf libcext.a $(SRC:.c=.o) gen tool/ucattab
.PHONY: clean
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
CC = cc
CFLAGS = -std=c11 -O2 -Wall -pedantic -Iinc
AR = ar
+
+# Location the Makefile downloads gen/UnicodeData.txt from.
+UNICODE_DATA_URL = https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
diff --git a/inc/cext/all.h b/inc/cext/all.h
@@ -5,5 +5,6 @@
#include "cext/log.h"
#include "cext/opt.h"
#include "cext/str.h"
+#include "cext/unicode.h"
#include "cext/utf8.h"
#include "cext/vector.h"
diff --git a/inc/cext/unicode.h b/inc/cext/unicode.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "cext/def.h"
+
+/* Look up the Unicode general category of the code point r.
+ *
+ * Returns a pointer to a NUL-terminated two-letter abbreviation such as
+ * "Lu" or "Nd"; code points that UnicodeData.txt does not list are reported
+ * as "Cn".  The string lives in a static table inside the library, so it
+ * must not be freed.  A null pointer is returned when r is greater than
+ * RUNE_MAX. */
+char *cext_unicode_category(rune r);
diff --git a/src/unicode.c b/src/unicode.c
@@ -0,0 +1,36 @@
+#include "cext/cext.h"
+#include "../gen/ucattab.inc"
+
+/* Two-letter category abbreviations, each stored together with its NUL
+ * terminator so that entry n starts at offset 3*n.  The category numbers in
+ * the generated tables are computed by tool/ucattab.c from an identical
+ * string, so the two copies must list the categories in the same order. */
+static char ucats[] =
+ "Lu\0Ll\0Lt\0Lm\0Lo\0"
+ "Mn\0Mc\0Me\0"
+ "Nd\0Nl\0No\0"
+ "Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+ "Sm\0Sc\0Sk\0So\0"
+ "Zs\0Zl\0Zp\0"
+ "Cc\0Cf\0Cs\0Co\0Cn";
+
+/* Return the two-letter general category abbreviation for r, or a null
+ * pointer if r is greater than RUNE_MAX.  The result points into the static
+ * ucats table. */
+char *
+cext_unicode_category(rune r) {
+	/* Latin 1 is served from its own table, indexed by code point. */
+	if (r <= 0xff)
+		return &ucats[3 * ucatl1tab[r]];
+
+	if (r > RUNE_MAX)
+		return 0;
+
+	/* Each ucattab entry holds a category in its high byte and, in its
+	 * low three bytes, the first code point of a run with that category.
+	 * Bisect for the entry whose run contains r: the one that starts at
+	 * or before r while its successor, if any, starts after r. */
+	usize lo = 0;
+	usize hi = LEN(ucattab);
+	while (lo < hi) {
+		usize mid = lo + (hi - lo) / 2;
+		u32 cur = ucattab[mid];
+
+		if (r < (cur & 0xffffff)) {
+			hi = mid;
+			continue;
+		}
+
+		if (mid + 1 >= LEN(ucattab) || r < (ucattab[mid+1] & 0xffffff))
+			return &ucats[3 * (cur >> 24)];
+
+		lo = mid + 1;
+	}
+
+	return 0;
+}
diff --git a/tool/ucattab.c b/tool/ucattab.c
@@ -0,0 +1,111 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/alloc.h"
+#include "cext/cext.h"
+#include "cext/log.h"
+#include "cext/str.h"
+
+#define NF 15 /* Number of fields in UnicodeData.txt */
+
+/* Translate a category abbreviation such as "Lu" into its position in the
+ * table below.  An abbreviation that is not in the table is reported
+ * through fatalf. */
+u8
+cattoi(char *cat) {
+	static char ucats[] =
+		"Lu\0Ll\0Lt\0Lm\0Lo\0"
+		"Mn\0Mc\0Me\0"
+		"Nd\0Nl\0No\0"
+		"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+		"Sm\0Sc\0Sk\0So\0"
+		"Zs\0Zl\0Zp\0"
+		"Cc\0Cf\0Cs\0Co\0Cn";
+
+	/* Every entry occupies three bytes: two letters and a NUL. */
+	usize ncats = LEN(ucats) / 3;
+	for (usize n = 0; n < ncats; n++) {
+		if (strcmp(cat, ucats + 3*n) == 0)
+			return n;
+	}
+
+	fatalf("bad category '%s'", cat);
+	return 0; /* Suppress warning */
+}
+
+/* Read the UnicodeData.txt file named by filename and return an array,
+ * obtained from ealloc, of RUNE_MAX + 1 category numbers (as produced by
+ * cattoi), indexed by code point.
+ *
+ * Code points that the file does not mention are given "Cn".  A pair of
+ * entries whose names end in "First>" and "Last>" assigns the category to
+ * every code point in between.  Malformed input is reported through fatalf,
+ * which is assumed not to return. */
+u8 *
+parse_cats(char *filename) {
+	FILE *f = fopen(filename, "rb");
+	if (!f) fatalf("fopen: %s", strerror(errno));
+
+	u8 *cats = ealloc(RUNE_MAX + 1);
+
+	char line[512];
+	bool inrange = false;
+	i32 prevcp = -1;
+	while (fgets(line, sizeof line, f)) {
+		/* A missing newline is acceptable only on the final line of
+		 * the file; anywhere else the line overflowed the buffer. */
+		char *nl = strchr(line, '\n');
+		if (nl)
+			*nl = '\0';
+		else if (!feof(f))
+			fatalf("line too long");
+
+		char **fields = (char *[NF]){0};
+		if (cext_str_split(&fields, line, ";", NF) != NF)
+			fatalf("line has too few fields");
+
+		/* The code point is used as an index into cats, so reject
+		 * anything that is not a hexadecimal number in the range
+		 * [0, RUNE_MAX] before it can cause an out-of-bounds write. */
+		char *end;
+		errno = 0;
+		long val = strtol(fields[0], &end, 16);
+		if (end == fields[0] || *end != '\0' || errno == ERANGE
+		    || val < 0 || val > RUNE_MAX)
+			fatalf("bad code point '%s'", fields[0]);
+		i32 cp = val;
+
+		char *name = fields[1];
+		u8 cat = cattoi(!strcmp(fields[2], "") ? "Cn" : fields[2]);
+
+		/* We expect UnicodeData.txt to be sorted, but I can't find any
+		 * guarantee of that in UAX #44. */
+		assert(cp > prevcp);
+		assert(!inrange || cext_str_ends_with(name, "Last>"));
+
+		/* Fill the gap since the previous entry: with the range's
+		 * category inside a First/Last pair, otherwise with "Cn". */
+		for (i32 c = prevcp+1; c <= cp; c++)
+			cats[c] = inrange || c == cp ? cat : cattoi("Cn");
+
+		inrange = cext_str_ends_with(name, "First>");
+		prevcp = cp;
+	}
+	if (!feof(f)) fatalf("fgets: %s", strerror(errno));
+
+	/* Everything after the last listed code point is unassigned. */
+	for (i32 c = prevcp+1; c <= RUNE_MAX; c++)
+		cats[c] = cattoi("Cn");
+
+	fclose(f);
+
+	return cats;
+}
+
+/* Write the C source of the category lookup tables to stdout, built from
+ * the UnicodeData.txt file named by the single command-line argument.
+ *
+ * Two tables are emitted: ucatl1tab, with one category number for each of
+ * the code points 0x00-0xff, and ucattab, which covers 0x100-RUNE_MAX with
+ * one entry per run of consecutive code points sharing a category. */
+int
+main(int argc, char **argv) {
+	if (argc != 2)
+		fatalf("usage: %s UNICODE_DATA_FILE", argv[0]);
+
+	u8 *cats = parse_cats(argv[1]);
+
+	printf(
+		"/* This file is automatically generated. */\n"
+		"\n"
+		"/* Special table to optimize for Latin 1 */\n"
+		"static u8 ucatl1tab[] = {"
+	);
+
+	for (rune i = 0; i <= 0xff; i++)
+		printf("%s0x%02x,", i%8 == 0 ? "\n\t" : " ", cats[i]);
+
+	printf(
+		"\n}; /* 256 bytes */\n"
+		"\n"
+		"/* The high byte is the category, and the low three bytes are the\n"
+		" * codepoint. Codepoints are listed in order, with consecutive\n"
+		" * entries with the same category compressed into one entry. */\n"
+		"static u32 ucattab[] = {"
+	);
+
+	usize tablen = 0;
+	for (rune i = 0x100; i <= RUNE_MAX; i++) {
+		/* Emit an entry only where the category changes. */
+		if (i > 0x100 && cats[i] == cats[i-1])
+			continue;
+		printf("%s0x%08"PRIx32"ul,", tablen%4 == 0 ? "\n\t" : " ",
+			((rune) cats[i] << 24) | i);
+		tablen++;
+	}
+
+	/* The byte count is unsigned, so it is printed with %zu. */
+	printf("\n}; /* %zu bytes */\n", (size_t)(4 * tablen));
+
+	/* The Makefile redirects stdout into the generated include file, so
+	 * a failed write must not go unnoticed. */
+	if (fflush(stdout) == EOF || ferror(stdout))
+		fatalf("write: %s", strerror(errno));
+
+	return 0;
+}