Module:Unicode data/category
Appearance
Unicode General Category data derived from DerivedGeneralCategory.txt in the Unicode Character Database.
The data was generated by the two Lua 5.3 scripts below (make_data.lua, which parses the file, and print_data.lua, which writes the result); LPeg is required. If both scripts are in the same folder as DerivedGeneralCategory.txt, open a command line in that folder and run lua print_data.lua to write the data to data.lua.
| Lua 5.3 scripts |
|---|
-- Generator script (loaded by the printer script as make_data.lua): parses
-- DerivedGeneralCategory.txt and returns the collected category data.

-- Read the whole data file into memory. The io.open call is fully
-- parenthesised so that the mode really is passed to io.open and so that, on
-- failure, assert receives io.open's own (nil, message) pair and reports the
-- actual reason instead of a stray 'r'.
local f = assert(io.open('DerivedGeneralCategory.txt', 'r'))
local Derived_General_Category = f:read 'a'
f:close()

-- Publish every LPeg entry whose name begins with a character that equals its
-- own uppercase form as a global, so the grammar below can refer to P, V, C,
-- Cmt, R and S without the `lpeg.` prefix.
local lpeg = require 'lpeg'
for k, v in pairs(lpeg) do
	if type(k) == 'string' then
		local first_letter = k:sub(1, 1)
		if first_letter == first_letter:upper() then
			_ENV[k] = v
		end
	end
end

-- Accumulator filled in by process_match:
--   singles: map from code point (number) to category string
--   ranges:  array of { low, high, category }
local General_Category_data = { singles = {}, ranges = {} }
--- Match-time callback (used with Cmt) that records one parsed data line in
-- General_Category_data.
-- @param str  the whole subject string (unused)
-- @param pos  current match position; returned unchanged
-- @param ...  either (low, high, category) for a `XXXX..XXXX ; gc` line or
--             (codepoint, category) for a `XXXX ; gc` line; the code point
--             arguments are hexadecimal strings
-- @return pos
local function process_match(str, pos, ...)
	local first, second, third = ...
	if third then
		-- Range line: first/second are the bounds, third is the category.
		if third ~= 'Cn' then
			local ranges = General_Category_data.ranges
			ranges[#ranges + 1] = { tonumber(first, 16), tonumber(second, 16), third }
		end
	elseif second ~= 'Cn' then
		-- Single code point line: first is the code point, second the category.
		General_Category_data.singles[tonumber(first, 16)] = second
	end
	return pos
end
-- Grammar for the data file. Every place where a data line can be matched is
-- handed to process_match; any other character is skipped one at a time.
local patt = P {
	'file', -- name of the initial rule
	file = (V 'line' + P(1))^1,
	-- A data line: a range or a single code point, ';', the category, then
	-- everything up to (not including) the line break.
	line = Cmt(
		(V 'range' + C(V 'codepoint'))
			* V 'white' * P ';' * V 'white'
			* C(V 'gc')
			* (P(1) - V 'nl')^0,
		process_match
	),
	range = C(V 'codepoint') * P '..' * C(V 'codepoint'),
	-- Four to six hexadecimal digits.
	codepoint = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-2,
	-- An uppercase letter followed by any one character.
	-- NOTE(review): the second character is unconstrained; presumably it is
	-- always a lowercase letter in the data file -- confirm before tightening.
	gc = R 'AZ' * P(1),
	hex = R('09', 'AF'),
	white = S ' \t'^0,
	nl = P '\r'^-1 * P '\n',
}

-- Run the grammar for its side effects on General_Category_data, then hand
-- the collected data to whoever loaded this script.
patt:match(Derived_General_Category)
return General_Category_data
-- Printer script: writes the table returned by make_data.lua to data.lua.

-- Run the generator script and keep the table it returns.
local data = dofile 'make_data.lua'
-- Open the destination file; assert raises with io.open's message on failure.
local output = assert(io.open('data.lua', 'w'))
--- Format the arguments with string.format and write the result to `output`.
-- @param ...  a format string followed by its values
local function writef(...)
	local text = string.format(...)
	output:write(text)
end
-- Opening of the generated file and of its `singles` table.
writef [[
return {
singles = {
]]
-- Check that maximum "singles" codepoint is less than 0x100000?

-- Emit the entries in ascending code point order using only the standard
-- library: collect the keys, sort them, then walk the sorted list. This
-- replaces the former `require 't'.spairs`, which depended on a module that
-- is not part of Lua or LPeg.
-- NOTE(review): assumes spairs iterated in ascending key order -- confirm.
local single_codepoints = {}
for codepoint in pairs(data.singles) do
	single_codepoints[#single_codepoints + 1] = codepoint
end
table.sort(single_codepoints)
for _, codepoint in ipairs(single_codepoints) do
	writef('\t\t [0x%05X] = "%s",\n', codepoint, data.singles[codepoint])
end

-- Close `singles` and open the `ranges` table.
writef [[
},
ranges = {
]]
--- Ordering predicate for table.sort: a range comes before another when its
-- first element is strictly smaller.
-- @param range1  array whose first element is compared
-- @param range2  array whose first element is compared
-- @return boolean
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end
-- Emit the ranges ordered by their first element, one table constructor per
-- line, then close the `ranges` table and the returned table.
table.sort(data.ranges, compare_ranges)
for _, range in ipairs(data.ranges) do
	writef('\t\t{ 0x%06X, 0x%06X, "%s" },\n', table.unpack(range))
end
writef [[
},
}]]

-- Close the file explicitly so that buffered output is flushed here and a
-- failure to write it out raises an error instead of passing unnoticed.
assert(output:close())
|
-- [[:commons:Data:Unicode/data/category/singles.tab]]
-- [[:commons:Data:Unicode/data/category/ranges.tab]]
-- [[:commons:Data:Unicode/data/category/names.tab]]
--- Build the category lookup tables from three tabular datasets fetched with
-- mw.ext.data.get.
--
-- Rows whose category is "Cn" are left out of `singles` and `ranges`, as are
-- rows with a missing cell or with a code point cell that is not valid
-- hexadecimal.
--
-- @return table with three fields:
--   singles     map from code point (number) to category string
--   ranges      array of { low, high, category }
--   long_names  map from the first to the second column of names.tab
-- Raises an error naming the dataset when one of them cannot be loaded.
local function get_result()
	-- Fetch one dataset and return its array of rows. Anything other than a
	-- table with a `data` table is reported with the dataset title, so the
	-- failure is not an anonymous "attempt to index" error.
	local function load_rows(title)
		local dataset = mw.ext.data.get(title)
		if type(dataset) ~= "table" or type(dataset.data) ~= "table" then
			error("Unicode data/category: cannot load dataset " .. title)
		end
		return dataset.data
	end

	local result = { singles = {}, ranges = {}, long_names = {} }

	-- singles: hexadecimal code point, category
	for _, cols in ipairs(load_rows("Unicode/data/category/singles.tab")) do
		local category = cols[2]
		if cols[1] and category and category ~= "Cn" then
			local codepoint = tonumber(cols[1], 16)
			-- A nil key would raise "table index is nil"; skip such rows.
			if codepoint then
				result.singles[codepoint] = category
			end
		end
	end

	-- ranges: hexadecimal low bound, hexadecimal high bound, category
	local ranges = result.ranges
	for _, cols in ipairs(load_rows("Unicode/data/category/ranges.tab")) do
		local category = cols[3]
		if cols[1] and cols[2] and category and category ~= "Cn" then
			local low, high = tonumber(cols[1], 16), tonumber(cols[2], 16)
			-- Keep only ranges with two usable bounds so the array stays dense
			-- and every entry has numeric limits.
			if low and high then
				ranges[#ranges + 1] = { low, high, category }
			end
		end
	end

	-- long_names: first column -> second column
	for _, cols in ipairs(load_rows("Unicode/data/category/names.tab")) do
		if cols[1] and cols[2] then
			result.long_names[cols[1]] = cols[2]
		end
	end

	return result
end
-- Build the tables when the module is loaded and return them as the module's
-- value.
local category_data = get_result()
return category_data