// Copyright 2020 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package classifier import ( "fmt" "strings" "testing" "github.com/sergi/go-diff/diffmatchpatch" ) func TestLevenshteinDiff(t *testing.T) { tests := []struct { name string diffs []diffmatchpatch.Diff expected int }{ { name: "identical text", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "equivalent text", }, }, expected: 0, }, { name: "changed text", // Adjacent inverse changes get scored with the maximum of the 2 change scores diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffDelete, Text: "removed words", }, { Type: diffmatchpatch.DiffInsert, Text: "inserted text here", }, }, expected: 3, }, { name: "inserted text", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "identical words", }, { Type: diffmatchpatch.DiffInsert, Text: "inserted", }, }, expected: 1, }, { name: "deleted text", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffDelete, Text: "many extraneous deleted words", }, { Type: diffmatchpatch.DiffEqual, Text: "before the equivalent text", }, }, expected: 4, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { if got := diffLevenshteinWord(test.diffs); got != test.expected { t.Errorf("got %d wanted %d", got, test.expected) } }) } } func TestScoreDiffs(t *testing.T) { tests := []struct { name string license string diffs []diffmatchpatch.Diff expected int }{ { name: "identical text", license: "License/MIT/license.txt", diffs: nil, expected: 0, }, { name: "acceptable change", license: "License/MIT/license.txt", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "license", }, { Type: diffmatchpatch.DiffInsert, Text: "as needed", }, { Type: diffmatchpatch.DiffDelete, Text: "when necessary", }, }, expected: 2, }, { name: "version change", license: "License/MIT/license.txt", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "version", }, { Type: diffmatchpatch.DiffInsert, Text: "2", }, }, expected: versionChange, }, { name: "license name change by deletion", license: "License/MIT/license.txt", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "gnu", }, { Type: diffmatchpatch.DiffDelete, Text: "lesser", }, }, expected: lesserGPLChange, }, { name: "license name change by insertion", license: "License/MIT/license.txt", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "gnu", }, { Type: diffmatchpatch.DiffInsert, Text: "lesser", }, }, expected: lesserGPLChange, }, { name: "license name change by name insertion", license: "License/ImageMagick/license.txt", diffs: []diffmatchpatch.Diff{ { Type: diffmatchpatch.DiffEqual, Text: "license", }, { Type: diffmatchpatch.DiffInsert, Text: "imagemagick", }, }, expected: introducedPhraseChange, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { if got := scoreDiffs(test.license, test.diffs); got != test.expected { t.Errorf("got %d, want %d", got, test.expected) } }) } } func TestConfidencePercentage(t *testing.T) { tests := []struct { name string klen, distance int expected float64 }{ { name: "empty text", klen: 0, distance: 0, expected: 1.0, }, { name: "99% match", klen: 100, distance: 1, expected: 0.99, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { if got := confidencePercentage(test.klen, test.distance); got != test.expected { t.Errorf("got %v want %v", got, test.expected) } }) } } func TestScore(t *testing.T) { tests := []struct { name string known, unknown string expectedConf float64 expectedStart, expectedEnd int }{ { name: "identical text", known: "here is some sample text", unknown: "here is some sample text", expectedConf: 1.00, expectedStart: 0, expectedEnd: 0, }, { name: "close match with matching sizes", known: "here is some sample text", unknown: "here is different sample text", expectedConf: .8, expectedStart: 0, expectedEnd: 0, }, { name: "close match with different sizes", known: "here is some sample text", unknown: "padding before here is different sample text", expectedConf: .8, expectedStart: 2, expectedEnd: 0, }, { name: "no match due to unacceptable diff", known: "here is some sample text for version 2 of the license", unknown: "padding before here is different sample text for version 3 of the licenses", expectedConf: 0.0, expectedStart: 0, expectedEnd: 0, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { var trace strings.Builder c := NewClassifier(.8) c.SetTraceConfiguration(&TraceConfiguration{ TraceLicenses: "*", TracePhases: "*", Tracer: func(f string, args ...interface{}) { trace.WriteString(fmt.Sprintf(f, args...)) }, }) c.AddContent("", "known", "", []byte(test.known)) kd := c.getIndexedDocument("", "known", "") ud := c.createTargetIndexedDocument([]byte(test.unknown)) // The name for the test needs to look like an asset path so we prepend // the directory. conf, so, eo := c.score("License/"+test.name, ud, kd, 0, ud.size()) success := true if conf != test.expectedConf { t.Errorf("conf: got %v want %v", conf, test.expectedConf) success = false } if so != test.expectedStart { t.Errorf("start offset: got %v want %v", so, test.expectedStart) success = false } if eo != test.expectedEnd { t.Errorf("end offset: got %v want %v", so, test.expectedEnd) success = false } if !success { t.Errorf("Trace:\n%s", trace.String()) } }) } }