// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"hash/crc32"
	"math"
	"sort"

	"github.com/davecgh/go-spew/spew"
)

// searchSet is a set of q-grams that have hashes associated with them,
// making it fast to search for potential matches.
type searchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens []indexedToken
	// Hashes is a map of checksums to a range of tokens.
	Hashes hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenRanges

	origin string // A debugging identifier to label what this searchset is associated with.

	nodes []*node

	q int // The length of q-grams in this searchset.
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}

// newSearchSet creates a new searchSet object. A searchset generates all
// possible q-grams of tokens. These q-grams of tokens can be correlated to
// determine where a section of text from one source may appear in another
// source.
func newSearchSet(s *indexedDocument, q int) *searchSet {
	// Start generating hash values for all q-grams within the text.
	h := make(hash)
	if len(s.Tokens) < q {
		// q can't be larger than the number of tokens in the document.
		q = len(s.Tokens)
	}
	checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict)
	sset := &searchSet{
		Tokens:         s.Tokens,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
		q:              q,
	}
	sset.generateNodeList()
	return sset
}

// tokenRange indicates the range of tokens that map to a particular checksum.
type tokenRange struct {
	Start int
	End   int
}

func (t *tokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// tokenRanges is a sortable slice of tokenRange.
type tokenRanges []*tokenRange

// generateHashes computes a CRC-32 checksum for each q-gram encountered in the
// provided tokens.
func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) {
	if q == 0 {
		return nil, nil
	}
	var css []uint32
	var tr tokenRanges
	crc := crc32.NewIEEE()
	for offset := 0; offset+q <= len(toks); offset++ {
		crc.Reset()
		for i := 0; i < q; i++ {
			crc.Write([]byte(dict.getWord(toks[offset+i].ID)))
			crc.Write([]byte{' '})
		}
		cs := crc.Sum32()
		css = append(css, cs)
		tr = append(tr, &tokenRange{offset, offset + q})
		h.add(cs, offset, offset+q)
	}

	return css, tr
}

// generateNodeList creates a node list out of the search set.
func (s *searchSet) generateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}
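
// An illustrative example of the q-gram expansion above (the token values are
// hypothetical): for a document tokenized as "a b c d" and q = 3,
// generateHashes emits one CRC-32 checksum per 3-token window, writing each
// word followed by a space:
//
//	crc32("a b c ") -> tokenRange [0, 3)
//	crc32("b c d ") -> tokenRange [1, 4)
//
// Both checksums are recorded in the hash map so that a target document can
// later look up which source offsets contain the same q-gram.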

// matchRange is the range within the source text that is a match to the range
// in the target text.
type matchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
	// TokensClaimed tracks the number of positively matched tokens in this
	// range. For initially created matchRanges, this is equal to the extent of
	// the range. However, as matchRanges get merged together and error is
	// potentially introduced, this tracks the precise number of tokens that
	// exist in the range.
	TokensClaimed int
}

// in reports whether this match's target range is enclosed in the other
// match's target range.
func (m *matchRange) in(other *matchRange) bool {
	return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd
}

func (m *matchRange) String() string {
	return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed)
}

// matchRanges is a list of matchRange values. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text. They are sorted to support greedy matching, with
// the longest runs of q-grams appearing first in the sort.
type matchRanges []*matchRange

func (m matchRanges) Len() int      { return len(m) }
func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchRanges) Less(i, j int) bool {
	if m[i].TokensClaimed != m[j].TokensClaimed {
		return m[i].TokensClaimed > m[j].TokensClaimed
	}
	if m[i].TargetStart != m[j].TargetStart {
		return m[i].TargetStart < m[j].TargetStart
	}
	return m[i].SrcStart < m[j].SrcStart
}

// findPotentialMatches returns the ranges in the target (unknown) text that
// are the best potential matches to the source (known) text.
func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
	matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
	}
	if len(matchedRanges) == 0 {
		return nil
	}

	// After computing all potential matches, we only output ranges that contain
	// enough tokens to clear the confidence threshold. As noted, this number can
	// be too high, yielding false positives, but cannot yield false negatives.
	threshold := int(confidence * float64(len(src.Tokens)))

	for i, m := range matchedRanges {
		if m.TokensClaimed < threshold {
			matchedRanges = matchedRanges[:i]
			break
		}
	}
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("finalized matchedRanges for %s: %d = %s", src.origin, len(src.Tokens), spew.Sdump(matchedRanges))
	}
	return matchedRanges
}
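
// To make the threshold trimming in findPotentialMatches concrete (the numbers
// here are hypothetical): with confidence = 0.8 and a 200-token source
// license, threshold = 160, so a candidate range that claimed only 120 tokens
// is dropped, along with every weaker range that sorts after it.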

// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives) for faster performance.
func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
	var claimed matchRanges
	errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))

	filter := make([]bool, targetSize)
	for _, m := range runs {
		for i := m.SrcStart; i < m.SrcEnd; i++ {
			// Only consider these offsets if they fit into the target document,
			// since the target may be smaller than the source being analyzed.
			if i < targetSize {
				filter[i] = true
			}
		}
	}

	filterDrops := 0
	filterPasses := 0

	// For each hit detected, compare it against all other previous hits to see
	// if it can be part of a match, or represents a group that is eligible for
	// matching and having other hits contribute to it.
	for _, m := range matched {
		off := m.TargetStart - m.SrcStart

		// If the offset is negative, but within error margins, we associate it
		// with the first index to see if it could contribute to a run. If the
		// absolute offset is larger than the error margin, it can't possibly
		// contribute and will be dropped. This puts more potential error into the
		// zero index, but that just slightly increases the rate of false
		// positives. In practice, this would only be an issue if there are major
		// substrings of a source in a target that aren't part of a real hit. We
		// see many small references (the name of a license) but not large chunks
		// of the license.
		if off < 0 {
			if -off <= errorMargin {
				off = 0
			} else {
				continue
			}
		}

		// If the filter is false, there was not sufficient token density in that
		// part of the target document for a viable match, so this match is a
		// spurious hit and can be discarded.
		if !filter[off] {
			filterDrops++
			continue
		}

		filterPasses++
		unclaimed := true

		for _, c := range claimed {
			moff := m.TargetStart - m.SrcStart
			coff := c.TargetStart - c.SrcStart
			sampleError := int(math.Round(math.Abs(float64(moff - coff))))
			withinError := sampleError < errorMargin

			// The contribution needs to add more value than accumulated error. This
			// prevents spurious matches of a reference to a license from
			// incorrectly overextending the match range.
			if withinError && m.TokensClaimed > sampleError {
				if m.in(c) {
					// This can cause too many tokens to be claimed, but that's fine
					// since we want to avoid undercounting and missing content.
					c.TokensClaimed += m.TokensClaimed
					unclaimed = false
				} else {
					// See if the claims can be merged. If the error tolerances allow
					// for it, we merge the claims and the ranges. This doesn't
					// accumulate error accurately, so it's possible that repeated
					// merges can introduce too much error to actually make a match,
					// but we won't get false negatives from this approach. Even if
					// the error tolerances allow for a merge, we only want to merge
					// if it increases the range of tokens being covered. If this
					// match is already handled by an existing (stronger by
					// definition) claim, we don't merge this one, but treat it as a
					// new claim. This allows for the case where a highly fragmented
					// text will be matched by a long series of low-score matches.
					if m.TargetStart < c.TargetStart && m.SrcStart < c.SrcStart {
						c.TargetStart = m.TargetStart
						c.SrcStart = m.SrcStart
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					} else if m.TargetEnd > c.TargetEnd && m.SrcEnd > c.SrcEnd {
						c.TargetEnd = m.TargetEnd
						c.SrcEnd = m.SrcEnd
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					}
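
					// A worked illustration of the merge above (hypothetical numbers):
					// with errorMargin = 20, an existing claim c covering target
					// [50, 190) and source [0, 140) has an offset of 50. A new match m
					// covering target [195, 210) and source [150, 165) has an offset
					// of 45, so sampleError = 5 and m claims 15 tokens, which clears
					// both checks. m extends c on the right in both source and target,
					// so the claim grows to target [50, 210), source [0, 165), and
					// picks up m's 15 claimed tokens.
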
					// This claim does not extend any existing block, and it may be
					// claimed in its own right.
				}
			}
			if !unclaimed {
				break
			}
		}

		// Only create a claim if this claim is likely relevant. If we had some
		// higher-quality hits, it's likely this is spurious noise. If we haven't
		// had any significantly better hits, we'll keep this around.
		if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
			claimed = append(claimed, m)
		}
	}

	sort.Sort(claimed)
	if c.tc.traceSearchset(origin) {
		c.tc.trace("filterPasses = %+v", filterPasses)
		c.tc.trace("filterDrops = %+v", filterDrops)
		c.tc.trace("claimed = %s", spew.Sdump(claimed))
	}
	return claimed
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
	shouldTrace := c.tc.traceSearchset(src.origin)
	if shouldTrace {
		c.tc.trace("src.origin = %+v", src.origin)
	}
	// Assemble a list of all the matched q-grams without any consideration to
	// error tolerances.
	matched := targetMatchedRanges(src, target)
	if shouldTrace {
		c.tc.trace("matched = %s", spew.Sdump(matched))
	}
	if len(matched) == 0 {
		return nil
	}

	// Perform neighborhood matching to figure out which clusters of q-grams have
	// sufficient likelihood to be a potential match to the source. For a
	// confidence threshold of X (X <= 1.0), we require that a sequence of N
	// target tokens contain N*X ordered source tokens in order to be a viable
	// match.
	//
	// It's much easier to compute potential ranges in the target if we disregard
	// proper ordering of source tokens initially, and see which source q-grams
	// appear in sufficient quantities to be a potential match. We can then
	// disregard any q-gram that falls outside of that range. This helps
	// significantly since processing token matches is an N^2 (or worse)
	// operation, so reducing N is a big win.
	runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)

	if shouldTrace {
		c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
	}

	// If there are no target runs of source tokens, we're done.
	if len(runs) == 0 {
		return nil
	}

	// Using the runs as a filter to ignore irrelevant matches, fuse the source
	// match ranges into larger matches (with possible errors) to see if we can
	// produce large enough runs that pass the confidence threshold.
	fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
	if shouldTrace {
		c.tc.trace("fr = %s", spew.Sdump(fr))
	}
	return fr
}

func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
	shouldTrace := c.tc.traceSearchset(origin)
	hits := make([]bool, targetLength)
	for _, m := range matched {
		for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
			hits[idx] = true
		}
	}
	if len(hits) == 0 {
		return nil
	}
	var out []int

	total := 0
	target := int(float64(subsetLength) * threshold)
	if shouldTrace {
		c.tc.trace("target = %+v", target)
		c.tc.trace("targetLength = %+v", targetLength)
		c.tc.trace("subsetLength = %+v", subsetLength)
	}

	// If we don't have at least 1 subset (i.e. the target is shorter than the
	// source), just analyze what we have.
	if len(hits) < subsetLength {
		if shouldTrace {
			c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
		}
		subsetLength = len(hits)
	}
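
	// An illustrative pass of the sliding window below (hypothetical values):
	// with subsetLength = 4 and target = 3, scanning hits = [T T F T T F T]
	// starts with total = 3 over indexes 0-3, so 0 is recorded; sliding to
	// index 1 drops hits[0] and adds hits[4], keeping total = 3, so 1 is
	// recorded as well. Runs of consecutive recorded indexes are later
	// coalesced into a single matchRange.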

	// Initialize our sliding window value.
	for i := 0; i < subsetLength; i++ {
		if hits[i] {
			total++
		}
	}
	if total >= target {
		out = append(out, 0)
	}

	// Now move through the window adjusting the total by subtracting out the
	// last bit and adding in the new bit.
	for i := 1; i < len(hits); i++ {
		if hits[i-1] {
			total--
		}
		end := i + subsetLength - 1
		if end < len(hits) && hits[end] {
			total++
		}
		if total >= target {
			out = append(out, i)
		}
	}

	if len(out) == 0 {
		return nil
	}

	final := []matchRange{
		{
			SrcStart: out[0],
			SrcEnd:   out[0] + q,
		},
	}
	for i := 1; i < len(out); i++ {
		if out[i] != 1+out[i-1] {
			final = append(final, matchRange{
				SrcStart: out[i],
				SrcEnd:   out[i] + q,
			})
		} else {
			final[len(final)-1].SrcEnd = out[i] + q
		}
	}

	return final
}

func targetMatchedRanges(src, target *searchSet) matchRanges {
	offsetMappings := make(map[int][]*matchRange)

	var matched matchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok {
			continue
		}
		tv := tgtNode.tokens
		for _, sv := range sr {
			offset := tv.Start - sv.Start
			if om, ok := offsetMappings[offset]; ok {
				// See if this extends the most recent existing mapping.
				lastIdx := len(om) - 1
				if om[lastIdx].TargetEnd == tv.End-1 {
					// This new value extends. Update the value in place.
					om[lastIdx].SrcEnd = sv.End
					om[lastIdx].TargetEnd = tv.End
					continue
				}
			}
			offsetMappings[offset] = append(offsetMappings[offset], &matchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			})
		}
	}

	// Compute the number of tokens claimed in each run and flatten into a single
	// slice.
	for _, mr := range offsetMappings {
		for _, m := range mr {
			m.TokensClaimed = m.TargetEnd - m.TargetStart
		}
		matched = append(matched, mr...)
	}
	sort.Sort(matched)
	return matched
}

type hash map[uint32]tokenRanges

func (h hash) add(checksum uint32, start, end int) {
	h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end})
}
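
// An end-to-end sketch of how this file's pieces fit together (illustrative
// only; the variable names and call site below are hypothetical, not actual
// classifier code):
//
//	src := newSearchSet(knownDoc, q)     // index the known license text
//	tgt := newSearchSet(unknownDoc, q)   // index the document under test
//	candidates := c.findPotentialMatches(src, tgt, confidence)
//
// Each returned matchRange marks a region of the unknown document that
// contains enough of the known license's q-grams to be worth scoring
// precisely.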