-- Module:User:Erutuon/grapheme cluster break

local char = mw.ustring.char

local data = mw.loadJsonData("Module:User:Erutuon/grapheme cluster break.json")

-- Range tables read from the JSON data; each one is searched with
-- binary_search_ranges (which reads their `length` and `ranges` fields).
local Grapheme_Break_Property, Extended_Pictographic, Indic_Conjunct_Break =
	data.Grapheme_Break_Property,
	data.Extended_Pictographic,
	data.Indic_Conjunct_Break

-- Mappings between property names and the integers stored in the range
-- tables (direction presumed from the JSON field names -- confirm against
-- the data page). `p` is the name -> integer side, used as `p.CR`, `p.LF`, ...
local integer_to_property_name, p =
	data.integer_to_property,
	data.property_to_integer

-- Initial value of the break state machine, before any code point has been
-- classified.
local Start_State = 0

-- Synthetic state recording "Extended_Pictographic followed by ZWJ".
-- Computed from the size of integer_to_property_name plus one, presumably so
-- that it cannot collide with a real property integer (depends on what
-- Module:table's `length` counts -- confirm).
local Extended_Pictographic_ZWJ =
	require("Module:table").length(integer_to_property_name) + 1

local floor = math.floor

--- Binary-search a table of code point ranges.
--
-- `data.ranges` is indexed from 1 to `data.length`; each entry is
-- `{ property_value, first [, last] }`, where a missing `last` means a
-- single-code-point range. The search assumes the entries are in ascending
-- order and do not overlap.
--
-- @param codepoint  value compared against the range bounds
-- @param data       table with the fields `length` and `ranges`
-- @return the `property_value` of the range containing `codepoint`;
--         nothing if no range contains it
local function binary_search_ranges(codepoint, data)
	local lo, hi = 1, data.length
	while lo <= hi do
		local middle = floor((lo + hi) / 2)
		local entry = data.ranges[middle]
		local range_start = entry[2]
		local range_end = entry[3] or range_start

		if codepoint < range_start then
			-- Target lies before this range: keep the lower half.
			hi = middle - 1
		elseif codepoint <= range_end then
			return entry[1]
		else
			-- Target lies after this range: keep the upper half.
			lo = middle + 1
		end
	end
	-- Falling out of the loop means no range matched; nothing is returned.
end

--- Look up `codepoint` in the Extended_Pictographic range table.
-- @param codepoint  code point to classify
-- @return the value stored for the matching range, or nothing when the
--         code point is not covered by the table
local function get_extended_pictographic_property(codepoint)
	return binary_search_ranges(
		codepoint,
		Extended_Pictographic
	)
end

--- Look up `codepoint` in the Indic_Conjunct_Break range table.
-- @param codepoint  code point to classify
-- @return the value stored for the matching range, or nothing when the
--         code point is not covered by the table
local function get_indic_conjunct_break_property(codepoint)
	return binary_search_ranges(
		codepoint,
		Indic_Conjunct_Break
	)
end

--- Look up `codepoint` in the Grapheme_Break_Property range table.
-- @param codepoint  code point to classify
-- @return the value stored for the matching range, or nothing when the
--         code point is not covered by the table
local function get_grapheme_break_property(codepoint)
	return binary_search_ranges(
		codepoint,
		Grapheme_Break_Property
	)
end

--- Classify a code point for the break state machine.
--
-- The Grapheme_Break_Property table is consulted first, then the
-- Extended_Pictographic table; if neither yields a value the result is
-- `p.Other`.
--
-- @param codepoint  code point to classify
-- @return the first truthy lookup result, else `p.Other`
local function get_multi_property(codepoint)
	local property = get_grapheme_break_property(codepoint)
	if not property then
		property = get_extended_pictographic_property(codepoint)
	end
	if not property then
		property = p.Other
	end
	return property
end

--- Render a class/state value as text via integer_to_property_name.
-- Values with no entry in that table (including nil) come out as the
-- string "nil", because the lookup result is passed through tostring.
-- @param maybe_class  key to look up in integer_to_property_name
-- @return string
local function maybe_class_to_string(maybe_class)
	local property_name = integer_to_property_name[maybe_class]
	return tostring(property_name)
end

--- Pairwise grapheme cluster break test.
--
-- Decides from two class values alone whether a break is allowed between
-- them. The rules are tried in the order below and the first match wins.
-- GB9c (InCB_Linker x InCB_Consonant) is deliberately not tested here; the
-- stateful wrapper grapheme_break_extended applies it through its own
-- indic_state tracking. GB11 and GB12/GB13 are only partial here: they rely
-- on the caller passing a `last` value that already encodes the preceding
-- context (Extended_Pictographic_ZWJ, or a Regional_Indicator that has not
-- been paired yet).
--
-- @param last     class or synthetic state on the left of the boundary
-- @param current  class on the right of the boundary
-- @return true if a break is permitted, false otherwise
local function grapheme_break_simple(last, current)
	-- GB1: nothing precedes `current`.
	if last == Start_State then
		return true
	end

	-- GB3: CR x LF
	if last == p.CR and current == p.LF then
		return false
	end

	-- GB4, GB5: always break after and before CR, LF and Control.
	local last_is_control = last == p.CR or last == p.LF or last == p.Control
	local current_is_control = current == p.CR or current == p.LF or current == p.Control
	if last_is_control or current_is_control then
		return true
	end

	-- GB6: L x (L | V | LV | LVT)
	if last == p.L then
		if current == p.L or current == p.V or current == p.LV or current == p.LVT then
			return false
		end
	end

	-- GB7: (LV | V) x (V | T)
	if last == p.LV or last == p.V then
		if current == p.V or current == p.T then
			return false
		end
	end

	-- GB8: (LVT | T) x T
	if (last == p.LVT or last == p.T) and current == p.T then
		return false
	end

	-- GB9: x (Extend | ZWJ); GB9a: x SpacingMark; GB9b: Prepend x
	if current == p.Extend or current == p.ZWJ then
		return false
	end
	if current == p.SpacingMark then
		return false
	end
	if last == p.Prepend then
		return false
	end

	-- GB11 (partial): Extended_Pictographic ... ZWJ x Extended_Pictographic
	if last == Extended_Pictographic_ZWJ and current == p.Extended_Pictographic then
		return false
	end

	-- GB12, GB13 (partial): Regional_Indicator x Regional_Indicator
	if last == p.Regional_Indicator and current == p.Regional_Indicator then
		return false
	end

	-- No rule forbids a break here.
	return true
end

--- Create a stateful grapheme cluster break tester.
--
-- Each call returns a fresh closure with its own private state. The closure
-- remembers context between calls (emoji ZWJ sequences, Regional_Indicator
-- pairing, Indic conjunct sequences), so it is presumably meant to be called
-- on successive adjacent code point pairs of one text, in order.
--
-- @return function(first_codepoint, second_codepoint): returns true if a
--         grapheme cluster break is permitted between the two code points,
--         false otherwise
local function grapheme_break_extended_with_state()
	-- Class (or synthetic state) standing in for the left-hand code point.
	local state = Start_State
	-- Progress through an Indic conjunct sequence (GB9c); nil when outside one.
	local indic_state

	local function grapheme_break_extended(first_codepoint, second_codepoint)
		local last_class = get_multi_property(first_codepoint)
		local current_class = get_multi_property(second_codepoint)
		local first_indic_class = get_indic_conjunct_break_property(first_codepoint)
		local second_indic_class = get_indic_conjunct_break_property(second_codepoint)

		-- On the first call there is no carried state yet, so seed it from
		-- the left-hand code point; afterwards the carried state replaces
		-- the left-hand class.
		local last_override
		if state == Start_State then
			state = last_class
			last_override = last_class
		else
			last_override = state
		end

		local break_permitted = grapheme_break_simple(last_override, current_class)

		-- Advance the Indic conjunct state using the left-hand code point.
		if first_indic_class == p.InCB_Consonant then
			indic_state = first_indic_class
		elseif first_indic_class == p.InCB_Extend then
			if indic_state == p.InCB_Consonant or indic_state == p.InCB_Extend then
				indic_state = p.InCB_Extend
			elseif indic_state == p.InCB_Linker then
				-- A Linker already seen stays recorded across Extends.
				indic_state = p.InCB_Linker
			end
		elseif first_indic_class == p.InCB_Linker then
			-- A Linker only counts once a sequence has been started.
			if indic_state ~= nil then
				indic_state = p.InCB_Linker
			end
		else
			indic_state = nil
		end

		-- GB9c: no break before a Consonant once a Linker has been seen in
		-- the current sequence.
		if indic_state == p.InCB_Linker and second_indic_class == p.InCB_Consonant then
			break_permitted = false
		end

		-- Compute the state carried into the next call.
		if state == current_class and current_class == p.Regional_Indicator then
			-- A Regional_Indicator pair is complete; reset so that a third
			-- indicator starts a new pair.
			state = p.Other
		elseif state == p.Extended_Pictographic then
			if current_class == p.Extend then
				state = p.Extended_Pictographic -- fold Extend code points into the emoji
			elseif current_class == p.ZWJ then
				state = Extended_Pictographic_ZWJ -- record the emoji + ZWJ combination
			else
				state = current_class
			end
		else
			state = current_class
		end

		return break_permitted
	end

	return grapheme_break_extended
end

-- Module export: the factory itself, so every caller can create its own
-- independent stateful break tester.
return grapheme_break_extended_with_state