// Copyright 2017 The PDFium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include "core/fpdfapi/font/cpdf_tounicodemap.h" #include #include #include "core/fpdfapi/font/cpdf_cid2unicodemap.h" #include "core/fpdfapi/font/cpdf_fontglobals.h" #include "core/fpdfapi/parser/cpdf_simple_parser.h" #include "core/fpdfapi/parser/cpdf_stream.h" #include "core/fpdfapi/parser/fpdf_parser_utility.h" #include "core/fxcrt/fx_extension.h" #include "core/fxcrt/fx_safe_types.h" #include "third_party/base/containers/contains.h" #include "third_party/base/numerics/safe_conversions.h" namespace { WideString StringDataAdd(WideString str) { WideString ret; wchar_t value = 1; for (size_t i = str.GetLength(); i > 0; --i) { wchar_t ch = str[i - 1] + value; if (ch < str[i - 1]) { ret.InsertAtFront(0); } else { ret.InsertAtFront(ch); value = 0; } } if (value) ret.InsertAtFront(value); return ret; } } // namespace CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr pStream) { Load(std::move(pStream)); } CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default; WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const { auto it = m_Multimap.find(charcode); if (it == m_Multimap.end()) { if (!m_pBaseMap) return WideString(); return WideString( m_pBaseMap->UnicodeFromCID(static_cast(charcode))); } uint32_t value = *it->second.begin(); wchar_t unicode = static_cast(value & 0xffff); if (unicode != 0xffff) return WideString(unicode); size_t index = value >> 16; return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString(); } uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const { for (const auto& pair : m_Multimap) { if (pdfium::Contains(pair.second, static_cast(unicode))) return pair.first; } return 0; } size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting( uint32_t charcode) const { auto it = m_Multimap.find(charcode); return it != m_Multimap.end() ? it->second.size() : 0u; } // static absl::optional CPDF_ToUnicodeMap::StringToCode(ByteStringView input) { // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065. std::set seen_whitespace_chars; for (char c : input) { if (PDFCharIsWhitespace(c)) { seen_whitespace_chars.insert(c); } } ByteString str_without_whitespace_chars; // Must outlive `str`. ByteStringView str; if (seen_whitespace_chars.empty()) { str = input; } else { str_without_whitespace_chars.Reserve(input.GetLength()); for (char c : input) { if (!pdfium::Contains(seen_whitespace_chars, c)) { str_without_whitespace_chars += c; } } str = str_without_whitespace_chars.AsStringView(); } size_t len = str.GetLength(); if (len <= 2 || str[0] != '<' || str[len - 1] != '>') return absl::nullopt; FX_SAFE_UINT32 code = 0; for (char c : str.Substr(1, len - 2)) { if (!FXSYS_IsHexDigit(c)) return absl::nullopt; code = code * 16 + FXSYS_HexCharToInt(c); if (!code.IsValid()) return absl::nullopt; } return absl::optional(code.ValueOrDie()); } // static WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) { size_t len = str.GetLength(); if (len <= 2 || str[0] != '<' || str[len - 1] != '>') return WideString(); WideString result; int byte_pos = 0; wchar_t ch = 0; for (char c : str.Substr(1, len - 2)) { if (!FXSYS_IsHexDigit(c)) break; ch = ch * 16 + FXSYS_HexCharToInt(c); byte_pos++; if (byte_pos == 4) { result += ch; byte_pos = 0; ch = 0; } } return result; } void CPDF_ToUnicodeMap::Load(RetainPtr pStream) { CIDSet cid_set = CIDSET_UNKNOWN; auto pAcc = pdfium::MakeRetain(std::move(pStream)); pAcc->LoadAllDataFiltered(); CPDF_SimpleParser parser(pAcc->GetSpan()); while (true) { ByteStringView word = parser.GetWord(); if (word.IsEmpty()) break; if (word == "beginbfchar") HandleBeginBFChar(&parser); else if (word == "beginbfrange") HandleBeginBFRange(&parser); else if (word == "/Adobe-Korea1-UCS2") cid_set = CIDSET_KOREA1; else if (word == "/Adobe-Japan1-UCS2") cid_set = CIDSET_JAPAN1; else if (word == "/Adobe-CNS1-UCS2") cid_set = CIDSET_CNS1; else if (word == "/Adobe-GB1-UCS2") cid_set = CIDSET_GB1; } if (cid_set != CIDSET_UNKNOWN) { m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set); } } void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) { while (true) { ByteStringView word = pParser->GetWord(); if (word.IsEmpty() || word == "endbfchar") return; absl::optional code = StringToCode(word); if (!code.has_value()) return; SetCode(code.value(), StringToWideString(pParser->GetWord())); } } void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) { while (true) { ByteStringView lowcode_str = pParser->GetWord(); if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange") return; absl::optional lowcode_opt = StringToCode(lowcode_str); if (!lowcode_opt.has_value()) return; ByteStringView highcode_str = pParser->GetWord(); absl::optional highcode_opt = StringToCode(highcode_str); if (!highcode_opt.has_value()) return; uint32_t lowcode = lowcode_opt.value(); uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff); ByteStringView start = pParser->GetWord(); if (start == "[") { for (FX_SAFE_UINT32 code = lowcode; code.IsValid() && code.ValueOrDie() <= highcode; code++) { SetCode(code.ValueOrDie(), StringToWideString(pParser->GetWord())); } pParser->GetWord(); continue; } WideString destcode = StringToWideString(start); if (destcode.GetLength() == 1) { absl::optional value_or_error = StringToCode(start); if (!value_or_error.has_value()) return; uint32_t value = value_or_error.value(); for (FX_SAFE_UINT32 code = lowcode; code.IsValid() && code.ValueOrDie() <= highcode; code++) { InsertIntoMultimap(code.ValueOrDie(), value++); } } else { for (FX_SAFE_UINT32 code = lowcode; code.IsValid() && code.ValueOrDie() <= highcode; code++) { uint32_t code_value = code.ValueOrDie(); WideString retcode = code_value == lowcode ? destcode : StringDataAdd(destcode); InsertIntoMultimap(code_value, GetMultiCharIndexIndicator()); m_MultiCharVec.push_back(retcode); destcode = std::move(retcode); } } } } uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const { FX_SAFE_UINT32 uni = m_MultiCharVec.size(); uni = uni * 0x10000 + 0xffff; return uni.ValueOrDefault(0); } void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) { size_t len = destcode.GetLength(); if (len == 0) return; if (len == 1) { InsertIntoMultimap(srccode, destcode[0]); } else { InsertIntoMultimap(srccode, GetMultiCharIndexIndicator()); m_MultiCharVec.push_back(destcode); } } void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) { auto it = m_Multimap.find(code); if (it == m_Multimap.end()) { m_Multimap.emplace(code, std::set{destcode}); return; } it->second.emplace(destcode); }