// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"strings"
	"unicode"

	"github.com/davecgh/go-spew/spew"
	"github.com/sergi/go-diff/diffmatchpatch"
)

// return values for the distance function that explain why a diff
// is not an acceptable match for the source document.
const (
	versionChange          = -1
	introducedPhraseChange = -2
	lesserGPLChange        = -3
)

// score computes a metric of similarity between the known and unknown
// document, including the offsets into the unknown that yield the content
// generating the computed similarity.
func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
	if c.tc.traceScoring(known.s.origin) {
		c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
	}

	knownLength := known.size()
	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)

	start, end := diffRange(known.Norm, diffs)
	distance := scoreDiffs(id, diffs[start:end])
	if c.tc.traceScoring(known.s.origin) {
		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
	}

	if distance < 0 {
		// If the distance is negative, this indicates an unacceptable diff, so we
		// return a zero-confidence match.
		if c.tc.traceScoring(known.s.origin) {
			c.tc.trace("Distance result %v, rejected match", distance)
		}
		return 0.0, 0, 0
	}

	// Applying the diffRange-generated offsets provides the run of text from the
	// target most closely correlated to the source. This is the process for
	// compensating for errors from the searchset. With the reduced text, we
	// compute the final confidence score and exact text locations for the
	// matched content.
	//
	// The diff slice consists of three regions: an optional deletion diff for
	// target text before the source, n elements that represent the diff between
	// the source and target, and an optional deletion diff for text after the
	// target. The helper functions identify the portions of the slice
	// corresponding to those regions. This results in a more accurate
	// confidence score and better position detection of the source in the
	// target.
	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])

	if c.tc.traceScoring(known.s.origin) {
		c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
	}
	return conf, so, eo
}

// confidencePercentage computes a confidence match score for the lengths,
// handling the cases where source and target lengths differ.
func confidencePercentage(klen, distance int) float64 {
	// No text is matched at 100% confidence (avoid divide by zero).
	if klen == 0 {
		return 1.0
	}

	// Return a computed fractional match against the known text.
	return 1.0 - float64(distance)/float64(klen)
}

// diffLevenshteinWord computes word-based Levenshtein count.
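// A contiguous run of deletions and insertions between equal regions
// contributes max(inserted words, deleted words) to the total, so replacing
// one word with another counts as a single substitution rather than two
// separate edits.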
func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
	levenshtein := 0
	insertions := 0
	deletions := 0

	for _, aDiff := range diffs {
		switch aDiff.Type {
		case diffmatchpatch.DiffInsert:
			insertions += wordLen(aDiff.Text)
		case diffmatchpatch.DiffDelete:
			deletions += wordLen(aDiff.Text)
		case diffmatchpatch.DiffEqual:
			// A deletion and an insertion is one substitution.
			levenshtein += max(insertions, deletions)
			insertions = 0
			deletions = 0
		}
	}

	levenshtein += max(insertions, deletions)
	return levenshtein
}

// isVersionNumber reports whether the input consists solely of digits and
// dots, i.e. whether it looks like a license version number.
func isVersionNumber(in string) bool {
	for _, r := range in {
		if !unicode.IsDigit(r) && r != '.' {
			return false
		}
	}
	return true
}

// scoreDiffs returns a score rating the acceptability of these diffs. A
// negative value means that the changes represented by the diff are not an
// acceptable transformation since they would change the underlying license. A
// positive value indicates the Levenshtein word distance.
func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
	// We make a pass looking for unacceptable substitutions.
	//
	// Delete diffs are always ordered before insert diffs. This is leveraged to
	// analyze a change by checking an insert against the delete text that was
	// previously cached.
	prevText := ""
	prevDelete := ""
	for i, diff := range diffs {
		text := diff.Text
		switch diff.Type {
		case diffmatchpatch.DiffInsert:
			// Check the leading token of the insert to see if it's a version
			// number.
			num := text
			if i := strings.Index(num, " "); i != -1 {
				num = num[0:i]
			}
			// A version number inserted right after equal text ending in "version"
			// indicates a different version of the license, except for the
			// "standard version" and "contributor version" phrases, which don't
			// refer to the license's own version.
			if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
				if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
					return versionChange
				}
			}

			// There are certain phrases that can't be introduced into the text and
			// still produce a valid license hit.
			//
			// TODO: we would like to generate this list programmatically. Most of
			// these are words or phrases that appear in only a single license or a
			// small number of licenses. Can we leverage frequency analysis to
			// identify these interesting words/phrases and auto-extract them?
			inducedPhrases := map[string][]string{
				"AGPL":                             {"affero"},
				"Atmel":                            {"atmel"},
				"Apache":                           {"apache"},
				"BSD":                              {"bsd"},
				"BSD-3-Clause-Attribution":         {"acknowledgment"},
				"bzip2":                            {"seward"},
				"GPL-2.0-with-GCC-exception":       {"gcc linking exception"},
				"GPL-2.0-with-autoconf-exception":  {"autoconf exception"},
				"GPL-2.0-with-bison-exception":     {"bison exception"},
				"GPL-2.0-with-classpath-exception": {"class path exception"},
				"GPL-2.0-with-font-exception":      {"font exception"},
				"LGPL-2.0":                         {"library"},
				"ImageMagick":                      {"imagemagick"},
				"PHP":                              {"php"},
				"SISSL":                            {"sun standards"},
				"SGI-B":                            {"silicon graphics"},
				"SunPro":                           {"sunpro"},
				"X11":                              {"x consortium"},
			}

			for k, ps := range inducedPhrases {
				if strings.HasPrefix(LicenseName(id), k) {
					for _, p := range ps {
						if strings.Contains(text, p) {
							// Check to make sure there isn't a corresponding diff for this
							// insert that also contains the text. This guards against diff
							// blocks that are too big and force a false hit on this check,
							// which usually happens with URLs since they are stored in one
							// token, but can happen in other cases as well. We don't look
							// only at delete diffs because the subsequent text may reference
							// the content in case a URL was truncated.
							if i+1 < len(diffs) && strings.Contains(diffs[i+1].Text, p) {
								continue
							}
							return introducedPhraseChange
						}
					}
				}
			}

			// Ignore changes between "library" and "lesser" in a GNU context as they
			// changed the terms, but look for introductions of Lesser that would
			// otherwise disqualify a match.
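			// For example, an insert of "lesser" right after equal text ending in
			// "gnu", where the cached delete was not "library", suggests the target
			// is really the LGPL and is rejected below unless it appears in the
			// warranty or "is covered by the gnu" contexts.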
if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" { // The LGPL 3.0 doesn't have a standard header, so people tend to craft // their own. As a result, sometimes the warranty clause refers to the // GPL instead of the LGPL. This is fine from a licensing perspective, // but we need to tweak matching to ignore that particular case. In // other circumstances, inserting or removing the word Lesser in the // GPL context is not an acceptable change. There is also a reference to // it when suggesting to use the LGPL. if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { return lesserGPLChange } } case diffmatchpatch.DiffEqual: prevText = text prevDelete = "" case diffmatchpatch.DiffDelete: // Avoid substitution in most cases. The two exceptions are with usage // statements that are talking about *another* license, and don't affect // the detection of the current license. if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") { // Same as above to avoid matching GPL instead of LGPL here. if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { return lesserGPLChange } } prevDelete = text } } return diffLevenshteinWord(diffs) }