1 /************************************************************************
\r
3 ** Copyright (C) 2010 Strahinja Markovic
\r
5 ** This file is part of FlightCrew.
\r
7 ** FlightCrew is free software: you can redistribute it and/or modify
\r
8 ** it under the terms of the GNU Lesser General Public License as published
\r
9 ** by the Free Software Foundation, either version 3 of the License, or
\r
10 ** (at your option) any later version.
\r
12 ** FlightCrew is distributed in the hope that it will be useful,
\r
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
15 ** GNU Lesser General Public License for more details.
\r
17 ** You should have received a copy of the GNU Lesser General Public License
\r
18 ** along with FlightCrew. If not, see <http://www.gnu.org/licenses/>.
\r
20 *************************************************************************/
\r
23 #include "Utilities.h"
\r
26 #include <xercesc/util/TransService.hpp>
\r
27 #include <xercesc/dom/DOMDocument.hpp>
\r
28 #include <xercesc/framework/MemBufInputSource.hpp>
\r
29 #include <XmlUtils.h>
\r
30 #include <LocationAwareDOMParser.h>
\r
31 #include <boost/filesystem/detail/utf8_codecvt_facet.hpp>
\r
32 #include <ToXercesStringConverter.h>
\r
35 namespace FlightCrew
\r
41 std::string ReadUnicodFile( const fs::path &filepath )
\r
43 fs::ifstream file( filepath, std::ios::in | std::ios::binary );
\r
45 if ( !file.is_open() )
\r
47 boost_throw( FileDoesNotExistEx() << ei_FilePath( BoostPathToUtf8Path( filepath ) ) );
\r
49 std::vector< char > contents( (std::istreambuf_iterator< char>( file )),
\r
50 std::istreambuf_iterator< char>() );
\r
52 // May as well be empty
\r
53 if ( contents.size() < 2 )
\r
55 return std::string();
\r
57 if ( utf8::is_valid( contents.begin(), contents.end() ) )
\r
59 return std::string( contents.begin(), contents.end() );
\r
62 if ( static_cast< unsigned char >( contents[ 0 ] ) == 0xfeU &&
\r
63 static_cast< unsigned char >( contents[ 1 ] ) == 0xffU )
\r
65 xc::TranscodeFromStr transcoder(
\r
66 (const XMLByte*) &( *contents.begin() ), contents.size() , "UTF-16BE" );
\r
68 xc::TranscodeToStr transcoder_utf8( transcoder.str(), "UTF-8" );
\r
70 return std::string( (char*) transcoder_utf8.str() );
\r
74 else if ( static_cast< unsigned char >( contents[ 0 ] ) == 0xffU &&
\r
75 static_cast< unsigned char >( contents[ 1 ] ) == 0xfeU )
\r
77 xc::TranscodeFromStr transcoder(
\r
78 (const XMLByte*) &( *contents.begin() ), contents.size(), "UTF-16LE" );
\r
80 xc::TranscodeToStr transcoder_utf8( transcoder.str(), "UTF-8" );
\r
82 return std::string( (char*) transcoder_utf8.str() );
\r
87 boost_throw( FileNotInUnicodeEx() << ei_FilePath( filepath.generic_string() ) );
\r
// Returns the first num_chars bytes of the given string, or the whole
// string when it is shorter than that. An empty input or num_chars == 0
// yields an empty string.
//
// The result is never longer than the input: no padding is added when
// fewer than num_chars bytes are available.
std::string GetFirstNumChars( const std::string &string, uint num_chars )
{
    // substr clamps the count to the number of bytes actually available.
    return string.substr( 0, num_chars );
}
\r
113 std::string GetFirstNumCharsFromFile( const fs::path &filepath, uint num_chars )
\r
117 // TODO: Let's not load the entire file
\r
118 std::string contents = Util::ReadUnicodFile( filepath );
\r
119 return GetFirstNumChars( contents, num_chars );
\r
122 catch ( FileNotInUnicodeEx& )
\r
124 return std::string();
\r
// Returns the 1-based number of the line that holds the byte at char_index.
//
// Line feed (\x0A) is used as the line separator when the text contains at
// least one; otherwise carriage return (\x0D) is used. Every separator found
// at a position less than or equal to char_index advances the line number,
// so an index that points at a separator is reported as the following line.
int LineOfCharIndex( const std::string &string, unsigned int char_index )
{
    const bool has_line_feed = string.find( "\x0A" ) != std::string::npos;
    const std::string separator( has_line_feed ? "\x0A" : "\x0D" );

    int line_number = 1;

    for ( size_t found = string.find( separator, 0 );
          found != std::string::npos && found <= char_index;
          found = string.find( separator, found + 1 ) )
    {
        ++line_number;
    }

    return line_number;
}
\r
163 boost::shared_ptr< xc::DOMDocument > RaiiWrapDocument( xc::DOMDocument *document )
\r
165 return boost::shared_ptr< xc::DOMDocument >( document, XercesExt::XercesDeallocator< xc::DOMDocument > );
\r
169 boost::shared_ptr< xc::DOMDocument > LoadXmlDocument( const fs::path &filepath )
\r
171 if ( filepath.empty() )
\r
173 boost_throw( XercesParsingError() );
\r
175 xe::LocationAwareDOMParser parser;
\r
177 // This scanner ignores schemas and DTDs
\r
178 parser.useScanner( xc::XMLUni::fgWFXMLScanner );
\r
179 parser.setValidationScheme( xc::AbstractDOMParser::Val_Never );
\r
180 parser.setDoNamespaces( true );
\r
182 parser.parse( toX( BoostPathToUtf8Path( filepath ) ) );
\r
184 xc::DOMDocument *document = parser.adoptDocument();
\r
188 boost_throw( XercesParsingError() );
\r
190 return RaiiWrapDocument( document );
\r
194 boost::shared_ptr< xc::DOMDocument > LoadXhtmlDocument( const fs::path &filepath )
\r
196 if ( filepath.empty() )
\r
198 boost_throw( XercesParsingError() );
\r
200 xe::LocationAwareDOMParser parser;
\r
202 parser.setDoSchema( false );
\r
203 parser.setLoadSchema( false );
\r
204 parser.setSkipDTDValidation( true );
\r
205 parser.setDoNamespaces( true );
\r
206 parser.useCachedGrammarInParse( true );
\r
208 parser.setValidationScheme( xc::AbstractDOMParser::Val_Never );
\r
210 // This scanner ignores schemas, but does use DTDs
\r
211 parser.useScanner( xc::XMLUni::fgDGXMLScanner );
\r
213 const xc::MemBufInputSource input( XHTML11_FLAT_DTD,
\r
214 XHTML11_FLAT_DTD_LEN,
\r
215 toX( XHTML11_FLAT_DTD_ID ) );
\r
217 parser.loadGrammar( input, xc::Grammar::DTDGrammarType, true );
\r
219 parser.parse( toX( BoostPathToUtf8Path( filepath ) ) );
\r
221 xc::DOMDocument *document = parser.adoptDocument();
\r
225 boost_throw( XercesParsingError() );
\r
227 return RaiiWrapDocument( document );
\r
// Interprets the given text as a hexadecimal number and returns it narrowed
// to a char (e.g. "41" -> 'A').
// Text that cannot be parsed as hexadecimal yields '\0'. The value is
// initialized up front so a failed extraction never reads an indeterminate
// integer.
char CharFromTwoHex( std::string two_hex_chars )
{
    std::istringstream stream( two_hex_chars );

    int int_value = 0;
    stream >> std::hex >> int_value;

    return static_cast< char >( int_value );
}
\r
// Returns the numeric value (0-15) of a single hexadecimal digit,
// or -1 when the character is not a hexadecimal digit.
static int UrlHexDigitValue( char digit )
{
    if ( digit >= '0' && digit <= '9' )

        return digit - '0';

    if ( digit >= 'a' && digit <= 'f' )

        return digit - 'a' + 10;

    if ( digit >= 'A' && digit <= 'F' )

        return digit - 'A' + 10;

    return -1;
}


// Decodes percent-escapes in the given URL: every "%XY" where X and Y are
// hexadecimal digits is replaced by the byte with that value. All other
// characters are copied unchanged.
//
// A '%' that is not followed by two hexadecimal digits (truncated escape at
// the end of the string, or non-hex characters such as "%zz") is kept
// literally instead of being turned into an arbitrary byte, so malformed
// input can never inject NUL or garbage bytes or swallow the characters
// that follow it.
std::string UrlDecode( const std::string &encoded_url )
{
    const std::string::size_type length = encoded_url.size();

    std::string decoded;
    decoded.reserve( length );

    std::string::size_type i = 0;

    while ( i < length )
    {
        // An escape needs the '%' plus two more characters inside the string.
        if ( encoded_url[ i ] == '%' && i + 2 < length )
        {
            const int high = UrlHexDigitValue( encoded_url[ i + 1 ] );
            const int low  = UrlHexDigitValue( encoded_url[ i + 2 ] );

            if ( high >= 0 && low >= 0 )
            {
                decoded += static_cast< char >( high * 16 + low );
                i += 3;
                continue;
            }
        }

        decoded += encoded_url[ i ];
        ++i;
    }

    return decoded;
}
\r
// Returns the fragment of the given URL: everything after the first '#'.
// Returns an empty string when there is no '#' or nothing follows it.
//
// The search result is kept as std::string::size_type and compared against
// npos, so no narrowing to int takes place.
std::string GetUrlFragment( const std::string &decoded_url )
{
    const std::string::size_type hash_location = decoded_url.find( '#' );

    if ( hash_location == std::string::npos )

        return std::string();

    // When '#' is the last character this is substr( size() ), i.e. empty.
    return decoded_url.substr( hash_location + 1 );
}
\r
// Returns the given URL with its fragment removed: everything from the
// first '#' onwards is dropped. A URL without '#' is returned unchanged.
//
// The search result is used directly as a std::string::size_type, so no
// narrowing to int takes place.
std::string UrlWithoutFragment( const std::string &decoded_url )
{
    // find() yields npos when there is no '#'; substr( 0, npos ) is then
    // the complete string.
    return decoded_url.substr( 0, decoded_url.find( '#' ) );
}
\r
// Strips a leading "file://" from the given URL.
// URLs that do not begin with that scheme are returned unchanged.
std::string UrlWithoutFileScheme( const std::string &decoded_url )
{
    const std::string scheme( "file://" );

    // compare() over the first scheme.size() characters is a prefix test;
    // a URL shorter than the scheme can never compare equal.
    const bool has_scheme = decoded_url.compare( 0, scheme.size(), scheme ) == 0;

    if ( !has_scheme )

        return decoded_url;

    return decoded_url.substr( scheme.size() );
}
\r
304 fs::path NormalizePath( const fs::path &filepath )
\r
306 std::string path_string = BoostPathToUtf8Path( filepath );
\r
307 boost::regex up_dir_regex( "[^/]+/\\.\\./" );
\r
311 std::string old_path = path_string;
\r
312 path_string = boost::erase_all_regex_copy( path_string, up_dir_regex );
\r
314 if ( path_string == old_path )
\r
319 boost::regex current_dir_regex( "(?<=/)\\./" );
\r
323 std::string old_path = path_string;
\r
324 path_string = boost::erase_all_regex_copy( path_string, current_dir_regex );
\r
326 if ( path_string == old_path )
\r
331 return Utf8PathToBoostPath( path_string );
\r
335 fs::path Utf8PathToBoostPath( const std::string &utf8_path )
\r
337 if ( utf8_path.empty() )
\r
341 if ( !utf8::is_valid( utf8_path.begin(), utf8_path.end() ) )
\r
343 boost_throw( PathNotInUtf8() << ei_FilePath( utf8_path ) );
\r
345 boost::filesystem::detail::utf8_codecvt_facet utf8facet;
\r
346 return fs::path( utf8_path, utf8facet );
\r
350 std::string BoostPathToUtf8Path( const fs::path &filepath )
\r
352 if ( filepath.empty() )
\r
354 return std::string();
\r
356 boost::filesystem::detail::utf8_codecvt_facet utf8facet;
\r
357 return filepath.generic_string( utf8facet );
\r
361 // Taking by const ref and making a copy could be costly,
\r
362 // but you know what they say about premature optimization.
\r
363 // If the profiler ends up screaming at this, then we'll refactor.
\r
364 std::vector< Result > AddPathToResults( const std::vector< Result > &results, const fs::path &filepath )
\r
366 std::vector< Result > mod_results = results;
\r
368 foreach( Result &result, mod_results )
\r
370 if ( result.GetFilepath().empty() )
\r
372 result.SetFilepath( BoostPathToUtf8Path( filepath ) );
\r
375 return mod_results;
\r
379 } // namespace Util
\r
381 } // namespace FlightCrew
\r