// Copyright 2023 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // This executable generates a comparison table in HTML format // for all ICU implementations presented in a given root folder package main import ( "errors" "flag" "fmt" "go.skia.org/skia/tools/unicode_comparison/go/helpers" "html/template" "os" "path/filepath" "sort" "strconv" "strings" ) type Range struct { Start int End int Type string } // Main numeric type type Ratio struct { Num int Total int } type FloatRatio struct { Top float64 Bottom float64 } func (r Ratio) Divide() string { if r.Num == 0 { return " " } else { return fmt.Sprintf("%.6f", float64(r.Num)/float64(r.Total)) } } func (fr FloatRatio) Percents() string { if fr.Top == fr.Bottom { return " " } else { return fmt.Sprintf("%.6f", (float64(fr.Top)/float64(fr.Bottom)-1)*100) } } func (r *Ratio) Add(other Ratio) { if other.Num > 0 { r.Num += other.Num r.Total += other.Total } } type CalculatedDelta struct { Memory float64 Performance FloatRatio Disk float64 RowCount int DiffCount int Graphemes Ratio SoftBreaks Ratio HardBreaks Ratio Whitespaces Ratio Words Ratio Controls Ratio Data RangedData } func NewCalculatedDelta() CalculatedDelta { return CalculatedDelta{ Performance: FloatRatio{0, 1}, Graphemes: Ratio{0, 1}, SoftBreaks: Ratio{0, 1}, HardBreaks: Ratio{0, 1}, Whitespaces: Ratio{0, 1}, Words: Ratio{0, 1}, Controls: Ratio{0, 1}, } } func (cd *CalculatedDelta) Add(other CalculatedDelta) { cd.Performance.Top += other.Performance.Top cd.Performance.Bottom += other.Performance.Bottom cd.Memory += other.Memory cd.Disk += other.Disk cd.RowCount += 1 cd.DiffCount += other.DiffCount cd.Graphemes.Add(other.Graphemes) cd.SoftBreaks.Add(other.SoftBreaks) cd.HardBreaks.Add(other.HardBreaks) cd.Whitespaces.Add(other.Whitespaces) cd.Words.Add(other.Words) cd.Controls.Add(other.Controls) } type RangeDataSet struct { Graphemes []Range SoftBreaks []Range HardBreaks []Range Whitespaces []Range Words []Range Controls []Range } type RangedData struct { Missing RangeDataSet Extra RangeDataSet } type ParsedData struct { Count int Time float64 Memory float64 Graphemes []int SoftBreaks []int HardBreaks []int Whitespaces []int Words []int Controls []int } func NewParsedData() ParsedData { return ParsedData{} } // Row type type Row struct { Id string Num string ParentId string Names []string Text string IsFile bool Delta CalculatedDelta Children []Row } func NewImpl(impl string) *Row { return &Row{Names: []string{impl}, IsFile: false, Delta: NewCalculatedDelta(), Children: nil} } func NewLocale(impl string, locale string) *Row { return &Row{Names: []string{impl, locale}, IsFile: false, Delta: NewCalculatedDelta(), Children: nil} } func NewSize(names ...string) *Row { return &Row{Names: names, IsFile: false, Delta: NewCalculatedDelta(), Children: nil} } func NewRow(text string, delta CalculatedDelta, names ...string) *Row { return &Row{Names: names, IsFile: true, Text: text, Delta: delta, Children: nil} } type Chunk struct { Text string Classes string Indexes Range } func (row Row) FormattedChunks(ranges []Range, name string, chunkType string, includeRange bool) []Chunk { var results []Chunk gap := 0 for i, r := range ranges { if i == 0 { continue } if r.Start > gap { text := row.Text[gap:r.Start] results = append(results, Chunk{text, "", Range{gap, r.Start, ""}}) } if includeRange { text := row.Text[r.Start:r.End] if name == "whitespace" { corrected := "" for _, t := range text { if t == ' ' { corrected += "nbsp;" } else { corrected += string(t) } } text = corrected } results = append(results, Chunk{text, name, Range{r.Start, r.End, chunkType}}) gap = r.End } else { results = append(results, Chunk{"\u200B", name, Range{r.Start, r.Start, chunkType}}) gap = r.Start } } if gap < len(row.Text) { text := row.Text[gap:] results = append(results, Chunk{text, "", Range{gap, len(row.Text), ""}}) } return results } func (r Row) FormattedMissingGraphemes() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.Graphemes, "grapheme", "missing", true) } func (r Row) FormattedExtraGraphemes() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.Graphemes, "grapheme", "extra", true) } func (r Row) FormattedMissingSoftBreaks() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.SoftBreaks, "softBreak", "missing", false) } func (r Row) FormattedExtraSoftBreaks() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.SoftBreaks, "softBreak", "extra", false) } func (r Row) FormattedMissingHardBreaks() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.HardBreaks, "hardBreak", "missing", false) } func (r Row) FormattedExtraHardBreaks() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.HardBreaks, "hardBreak", "extra", false) } func (r Row) FormattedMissingWords() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.Words, "word", "missing", false) } func (r Row) FormattedExtraWords() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.Words, "word", "extra", false) } func (r Row) FormattedMissingWhitespaces() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.Whitespaces, "whitespace", "missing", true) } func (r Row) FormattedExtraWhitespaces() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.Whitespaces, "whitespace", "extra", true) } func (r Row) FormattedMissingControls() []Chunk { return r.FormattedChunks(r.Delta.Data.Missing.Controls, "control", "missing", false) } func (r Row) FormattedExtraControls() []Chunk { return r.FormattedChunks(r.Delta.Data.Extra.Controls, "control", "extra", false) } func (r *Row) Add(child Row) { r.Delta.Add(child.Delta) r.Children = append(r.Children, child) } func (r Row) Name() string { if r.IsFile { return r.Names[len(r.Names)-1] } else { names := "" for i, name := range r.Names { if i > 0 { names += "." } names += name } return names } } func (r Row) ParentName() string { names := "" for i, name := range r.Names { if i == len(r.Names)-1 { break } if i > 0 { names += "." } names += name } return names } func (r Row) Implementation() string { return r.Names[0] } func (r Row) Level() string { return fmt.Sprintf("l%d", len(r.Names)) } func (r Row) HasText() bool { return len(r.Text) != 0 } func (r Row) HasChildren() bool { return len(r.Children) > 0 } func (r Row) IsImplementation() bool { return len(r.Names) == 1 } func (r Row) HasNoDifferences() bool { return len(r.Delta.Data.Missing.Graphemes) == 0 && len(r.Delta.Data.Missing.SoftBreaks) == 0 && len(r.Delta.Data.Missing.HardBreaks) == 0 && len(r.Delta.Data.Missing.Words) == 0 && len(r.Delta.Data.Missing.Whitespaces) == 0 && len(r.Delta.Data.Missing.Controls) == 0 && len(r.Delta.Data.Extra.Graphemes) == 0 && len(r.Delta.Data.Extra.SoftBreaks) == 0 && len(r.Delta.Data.Extra.HardBreaks) == 0 && len(r.Delta.Data.Extra.Words) == 0 && len(r.Delta.Data.Extra.Whitespaces) == 0 && len(r.Delta.Data.Extra.Controls) == 0 } func (r Row) Differences() int { return len(r.Delta.Data.Missing.Graphemes) + len(r.Delta.Data.Missing.SoftBreaks) + len(r.Delta.Data.Missing.HardBreaks) + len(r.Delta.Data.Missing.Words) + len(r.Delta.Data.Missing.Whitespaces) + len(r.Delta.Data.Missing.Controls) + len(r.Delta.Data.Extra.Graphemes) + len(r.Delta.Data.Extra.SoftBreaks) + len(r.Delta.Data.Extra.HardBreaks) + len(r.Delta.Data.Extra.Words) + len(r.Delta.Data.Extra.Whitespaces) + len(r.Delta.Data.Extra.Controls) } func (r Row) HasMissingGraphemes() bool { return len(r.Delta.Data.Missing.Graphemes) > 0 } func (r Row) HasExtraGraphemes() bool { return len(r.Delta.Data.Extra.Graphemes) > 0 } func (r Row) HasNoGraphemes() bool { return !r.HasMissingGraphemes() && !r.HasExtraGraphemes() } func (r Row) HasMissingSoftBreaks() bool { return len(r.Delta.Data.Missing.SoftBreaks) > 0 } func (r Row) HasExtraSoftBreaks() bool { return len(r.Delta.Data.Extra.SoftBreaks) > 0 } func (r Row) HasNoSoftBreaks() bool { return !r.HasMissingSoftBreaks() && !r.HasExtraSoftBreaks() } func (r Row) HasMissingHardBreaks() bool { return len(r.Delta.Data.Missing.HardBreaks) > 0 } func (r Row) HasExtraHardBreaks() bool { return len(r.Delta.Data.Extra.HardBreaks) > 0 } func (r Row) HasNoHardBreaks() bool { return !r.HasMissingHardBreaks() && !r.HasExtraHardBreaks() } func (r Row) HasMissingWhitespaces() bool { return len(r.Delta.Data.Missing.Whitespaces) > 0 } func (r Row) HasExtraWhitespaces() bool { return len(r.Delta.Data.Extra.Whitespaces) > 0 } func (r Row) HasNoWhitespaces() bool { return !r.HasMissingWhitespaces() && !r.HasExtraWhitespaces() } func (r Row) HasMissingWords() bool { return len(r.Delta.Data.Missing.Words) > 0 } func (r Row) HasExtraWords() bool { return len(r.Delta.Data.Extra.Words) > 0 } func (r Row) HasNoWords() bool { return !r.HasMissingWords() && !r.HasExtraWords() } func (r Row) HasMissingControls() bool { return len(r.Delta.Data.Missing.Controls) > 0 } func (r Row) HasExtraControls() bool { return len(r.Delta.Data.Extra.Controls) > 0 } func (r Row) HasNoControls() bool { return !r.HasMissingControls() && !r.HasExtraControls() } func (r Row) MissingGraphemeNum() int { return len(r.Delta.Data.Missing.Graphemes) - 1 } func (r Row) ExtraGraphemeNum() int { return len(r.Delta.Data.Extra.Graphemes) - 1 } func (r Row) MissingSoftBreakNum() int { return len(r.Delta.Data.Missing.SoftBreaks) - 1 } func (r Row) ExtraSoftBreakNum() int { return len(r.Delta.Data.Extra.SoftBreaks) - 1 } func (r Row) MissingHardBreakNum() int { return len(r.Delta.Data.Missing.HardBreaks) - 1 } func (r Row) ExtraHardBreakNum() int { return len(r.Delta.Data.Extra.HardBreaks) - 1 } func (r Row) MissingWhitespaceNum() int { return len(r.Delta.Data.Missing.Whitespaces) - 1 } func (r Row) ExtraWhitespaceNum() int { return len(r.Delta.Data.Extra.Whitespaces) - 1 } func (r Row) MissingWordNum() int { return len(r.Delta.Data.Missing.Words) - 1 } func (r Row) ExtraWordNum() int { return len(r.Delta.Data.Extra.Words) - 1 } func (r Row) MissingControlNum() int { return len(r.Delta.Data.Missing.Controls) - 1 } func (r Row) ExtraControlNum() int { return len(r.Delta.Data.Extra.Controls) - 1 } type WebPage struct { Title string Heading string Rows []Row } func assignIDs(children []Row, parentId, parentNum string) { for i := range children { children[i].Num = fmt.Sprintf("%s_%d", parentNum, i+1) children[i].Id = fmt.Sprintf("%s_%d", parentId, i+1) children[i].ParentId = parentId assignIDs(children[i].Children, children[i].Id, children[i].Num) } } func addImpl(web *WebPage, impl Row) { impl.Num = fmt.Sprintf("%d", len(web.Rows)+1) impl.Id = fmt.Sprintf("id_%d", len(web.Rows)+1) impl.ParentId = "" assignIDs(impl.Children, impl.Id, impl.Num) web.Rows = append(web.Rows, impl) } func parseFile(path string, textLen int) (ParsedData, error) { var result ParsedData // Time: float64 // Memory: float64 // Graphemes: n1 n2 ... // SoftBreaks: n1 n2 ... // HardBreaks: n1 n2 ... // Whitespaces: n1 n2 ... // Words: n1 n2 ... // Controls: n1 n2 ... content, err := os.ReadFile(path) if err != nil { return result, err } lines := strings.Split(string(content), "\n") if len(lines) < 8 { return result, errors.New("Wrong data format (number of lines)") } result.Time, err = strconv.ParseFloat(lines[0], 64) if err != nil { return result, errors.New("Wrong data format (time)") } result.Memory, err = strconv.ParseFloat(lines[1], 64) if err != nil { return result, errors.New("Wrong data format (memory)") } result.Graphemes = helpers.SplitAsInts(lines[2]+" "+strconv.Itoa(textLen), " ") result.SoftBreaks = helpers.SplitAsInts(lines[3]+" "+strconv.Itoa(textLen), " ") result.HardBreaks = helpers.SplitAsInts(lines[4]+" "+strconv.Itoa(textLen), " ") result.Whitespaces = helpers.SplitAsInts(lines[5]+" "+strconv.Itoa(textLen), " ") result.Words = helpers.SplitAsInts(lines[6]+" "+strconv.Itoa(textLen), " ") result.Controls = helpers.SplitAsInts(lines[7]+" "+strconv.Itoa(textLen), " ") return result, nil } func compareLines(expected []int, actual []int, includeRange bool, missing bool) (Ratio, []Range) { var diff []Range diff = append(diff, Range{len(actual), len(expected), ""}) aLen := len(actual) - 1 eLen := len(expected) - 1 e := 1 a := 1 for e < eLen || a < aLen { a1 := actual[a] if includeRange && a < aLen { a1 = actual[a+1] } e1 := expected[e] if includeRange && e < eLen { e1 = expected[e+1] } if e >= eLen { if !missing { diff = append(diff, Range{helpers.Abs(actual[a]), helpers.Abs(a1), "extra"}) } a += 1 } else if a >= aLen { if missing { diff = append(diff, Range{helpers.Abs(expected[e]), helpers.Abs(e1), "missing"}) } e += 1 } else if actual[a] < expected[e] { if !missing { diff = append(diff, Range{helpers.Abs(actual[a]), helpers.Abs(a1), "extra"}) } a += 1 } else if actual[a] > expected[e] { if missing { diff = append(diff, Range{helpers.Abs(expected[e]), helpers.Abs(e1), "missing"}) } e += 1 } else { a += 1 e += 1 } } // TODO: keep the difference, too if len(diff) > 1 { return Ratio{len(diff) - 1, len(expected)}, diff } else { return Ratio{0, 1}, nil } } func compareData(expected ParsedData, actual ParsedData) CalculatedDelta { var delta CalculatedDelta delta.Performance.Top = actual.Time delta.Performance.Bottom = expected.Time var deltaGraphemes, deltaSoftBreaks, deltaHardBreaks, deltaWhitespaces, deltaWords, deltaControls Ratio deltaGraphemes, delta.Data.Missing.Graphemes = compareLines(expected.Graphemes, actual.Graphemes, true, true) deltaGraphemes, delta.Data.Missing.Graphemes = compareLines(expected.Graphemes, actual.Graphemes, true, true) deltaSoftBreaks, delta.Data.Missing.SoftBreaks = compareLines(expected.SoftBreaks, actual.SoftBreaks, false, true) deltaHardBreaks, delta.Data.Missing.HardBreaks = compareLines(expected.HardBreaks, actual.HardBreaks, false, true) deltaWhitespaces, delta.Data.Missing.Whitespaces = compareLines(expected.Whitespaces, actual.Whitespaces, true, true) deltaWords, delta.Data.Missing.Words = compareLines(expected.Words, actual.Words, true, true) deltaControls, delta.Data.Missing.Controls = compareLines(expected.Controls, actual.Controls, false, true) delta.Graphemes.Add(deltaGraphemes) delta.SoftBreaks.Add(deltaSoftBreaks) delta.HardBreaks.Add(deltaHardBreaks) delta.Whitespaces.Add(deltaWhitespaces) delta.Words.Add(deltaWords) delta.Controls.Add(deltaControls) deltaGraphemes, delta.Data.Extra.Graphemes = compareLines(expected.Graphemes, actual.Graphemes, true, false) deltaSoftBreaks, delta.Data.Extra.SoftBreaks = compareLines(expected.SoftBreaks, actual.SoftBreaks, false, false) deltaHardBreaks, delta.Data.Extra.HardBreaks = compareLines(expected.HardBreaks, actual.HardBreaks, false, false) deltaWhitespaces, delta.Data.Extra.Whitespaces = compareLines(expected.Whitespaces, actual.Whitespaces, true, false) deltaWords, delta.Data.Extra.Words = compareLines(expected.Words, actual.Words, true, false) deltaControls, delta.Data.Extra.Controls = compareLines(expected.Controls, actual.Controls, false, false) delta.Graphemes.Add(deltaGraphemes) delta.SoftBreaks.Add(deltaSoftBreaks) delta.HardBreaks.Add(deltaHardBreaks) delta.Whitespaces.Add(deltaWhitespaces) delta.Words.Add(deltaWords) delta.Controls.Add(deltaControls) return delta } func printDifference(text string, diff []int) { count := diff[0] if len(diff) <= 1 { // No diff } else if (len(diff)-1)*10 < count { // Too small diff fmt.Printf("%d < %d:\n%s\n", (len(diff)-1)*10, count, text) return } else if count == 0 { // Too small string fmt.Printf("%d == 0:\n%s\n", count, text) return } first := helpers.Abs(diff[1]) last := first + 10 if last >= len(text) { last = len(text) - 1 } fmt.Printf("Difference @%d:\n%s\n", first, text[:last]) } func finishRows(rows []Row, start int) []Row { if len(rows) == 0 { return []Row{} } i := len(rows) - 1 for i > start { (rows)[i-1].Add((rows)[i]) i -= 1 } if start > 0 { return rows[:start] } else { return rows[:start+1] } } func findParentRow(rows []Row, name string) int { for i := range rows { row := rows[len(rows)-1-i] if row.Names[len(row.Names)-1] == name { return len(rows) - 1 - i } } return -1 } func compareFiles(inputPath string, sampleLimit int) (WebPage, error) { var rows []Row // Define the data to be used in the template web := WebPage{ Title: "Comparison Table (accuracy, performance and disk memory)", } err := filepath.Walk(inputPath, func(inputFile string, info os.FileInfo, err error) error { if err != nil { fmt.Println(err) return err } tokens := strings.Split(inputFile, string(os.PathSeparator)) outputIndex := -1 for i, t := range tokens { if t == "output" { outputIndex = i break } } if outputIndex < 0 { return fmt.Errorf("Currently only supported directory structure: [...]/output/{implementation}/{locale}:\n%s\n", inputFile) } if info.IsDir() { if len(tokens) == outputIndex+1 { // ~/datasets/output } else if len(tokens) == outputIndex+2 { // ~/datasets/output/icu rows = finishRows(rows, 0) if len(rows) > 0 { addImpl(&web, rows[0]) } rows = []Row{*NewImpl(tokens[outputIndex+1])} } else if len(tokens) == outputIndex+3 { // ~/datasets/output/icu/en rows = finishRows(rows, 0) rows = append(rows, *NewLocale(rows[0].Names[0], tokens[outputIndex+2])) } else { fmt.Printf("skipping %s\n", inputFile) return nil } } else if len(rows) <= 1 { return errors.New(fmt.Sprintf("Wrong directory structure: %s\n", inputFile)) } else { // Find the parent row parent := &rows[len(rows)-1] impl := parent.Names[0] // Read and parse the data textFile := strings.Replace(inputFile, filepath.Join("output", impl), "input", 1) textContent, err := os.ReadFile(textFile) helpers.Check(err) if len(textContent) == 0 { fmt.Printf("Empty text file %s\n", inputFile) return nil } var actualData ParsedData actualData, err = parseFile(inputFile, len(textContent)) if err != nil { return errors.New(fmt.Sprintf("Cannot parse output file %s: %s\n", inputFile, err.Error())) } var validationData ParsedData validationFile := strings.Replace(inputFile, filepath.Join("output", impl), "validation", 1) validationData, err = parseFile(validationFile, len(textContent)) if err != nil { return errors.New(fmt.Sprintf("Cannot parse validation file%s: %s\n", validationFile, err.Error())) } // Compare the data var delta CalculatedDelta _, shortFileName := filepath.Split(inputFile) delta = compareData(validationData, actualData) row := NewRow(string(textContent), delta, append(parent.Names, shortFileName)...) if !row.HasNoDifferences() { parent.Add(*row) sort.Slice(parent.Children, func(i, j int) bool { return parent.Children[i].Differences() > parent.Children[j].Differences() }) if len(parent.Children) > sampleLimit { parent.Children = parent.Children[0 : sampleLimit-1] } } } return nil }) rows = finishRows(rows, 0) if len(rows) > 0 { addImpl(&web, rows[0]) } return web, err } func main() { var ( root = flag.String("root", "~/datasets", "Folder (inputs for the table expected to be under /output/>") sampleLimit = flag.Int("sampleLimit", 10, "Number of files to show with differences") ) flag.Parse() if *root == "" { fmt.Println("Must set --root") flag.PrintDefaults() } *root = helpers.ExpandPath(*root) // Parse the template t, err := template.ParseFiles("../html/index.html", "../html/scripts.html", "../html/styles.html", "../html/tbody.html") helpers.Check(err) // Create index.html indexPath := filepath.Join(*root, "index.html") indexFile, err := os.Create(indexPath) helpers.Check(err) outputPath := filepath.Join(*root, "output") web, err := compareFiles(outputPath, *sampleLimit) helpers.Check(err) // Execute the template and write the result to index.html err = t.Execute(indexFile, web) helpers.Check(err) indexFile.Close() }