Clean up CodepointWidthDetector (#14396)

My long-term plan is to replace the `CodepointWidth` enum with a simple integer
return value that indicates the number of columns a codepoint occupies.
This is necessary so that we can return 0 for ZWJs (zero width joiners).

This initial commit represents a cleanup effort around `CodepointWidthDetector`.
Since less code runs faster, this change has the nice side-effect of running
roughly 5-10% faster across the board. It also drops the binary size by ~1.2kB.

## Validation Steps Performed
* `CodepointWidthDetectorTests` passes 
* U+26bf (``"`u{26bf}"`` inside pwsh) is a wide glyph
  in OpenConsole and a narrow one in Windows Terminal
This commit is contained in:
Leonard Hecker
2022-12-01 23:23:25 +01:00
committed by GitHub
parent 62ffa4ba41
commit 4bbe3a388c
14 changed files with 403 additions and 620 deletions

View File

@@ -20,8 +20,8 @@
# significantly smaller, which would provide a performance win on the admittedly
# extremely rare occasion that we should need to regenerate our table.
#
# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding
# UTF-8 Temporary.cpp
# Invoke this script from the root of this repository as:
# .\tools\Generate-CodepointWidthsFromUCD.ps1 -Path .\path\to\ucd.nounihan.flat.xml -OverridePath .\src\types\unicode_width_overrides.xml -Pack
#
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
# [2]: https://www.unicode.org/reports/tr42/
@@ -43,15 +43,13 @@ Param(
[string]$OverridePath = "overrides.xml",
[switch]$Pack, # Pack tightly based on width
[switch]$NoOverrides, # Do not include overrides
[switch]$Full = $False # Include Narrow codepoints
[switch]$NoOverrides # Do not include overrides
)
Enum CodepointWidth {
Narrow;
Wide;
Ambiguous;
Invalid;
}
# UCD Functions {{{
@@ -82,8 +80,8 @@ Function Get-UCDEntryWidth($entry) {
"W" { [CodepointWidth]::Wide; Return }
"F" { [CodepointWidth]::Wide; Return }
"A" { [CodepointWidth]::Ambiguous; Return }
default { throw "Unexpected East_Asian_Width property" }
}
[CodepointWidth]::Invalid
}
Function Get-UCDEntryFlags($entry) {
@@ -224,20 +222,10 @@ $UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
}
}
If (-not $Full) {
$UCDRepertoire = $UCDRepertoire | Where-Object {
# Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
}
}
$ranges = [UnicodeRangeList]::New(1024)
$c = 0
ForEach($v in $UCDRepertoire) {
$range = [UnicodeRange]::new($v)
$c += $range.Length()
If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
# Merged into last entry
Continue
@@ -260,9 +248,16 @@ If (-not $NoOverrides) {
}
}
$ranges.RemoveAll({ $args[0].Width -eq [CodepointWidth]::Narrow }) | Out-Null
$c = 0
ForEach($_ in $ranges) {
$c += $_.End - $_.Start + 1
}
# Emit Code
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
" // on {0} (UTC) from {1}." -f (Get-Date -AsUTC), $InputObject.ucd.description
" // on {0} from {1}." -f (Get-Date -AsUTC -Format "u"), $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
@@ -270,11 +265,12 @@ If (-not $NoOverrides) {
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($_ in $ranges) {
$isAmbiguous = $_.Width -eq [CodepointWidth]::Ambiguous
$comment = ""
if ($null -ne $_.Comment) {
# We only vend comments when we aren't packing tightly
$comment = " // {0}" -f $_.Comment
}
" UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $_.Start, $_.End, $_.Width, $comment
" UnicodeRange{{ 0x{0:x}, 0x{1:x}, {2} }},{3}" -f $_.Start, $_.End, [int]$isAmbiguous, $comment
}
" };"