src/FlightCrew/Misc/Utilities.cpp

   1 /************************************************************************\r
   2 **\r
   3 **  Copyright (C) 2010  Strahinja Markovic\r
   4 **\r
   5 **  This file is part of FlightCrew.\r
   6 **\r
   7 **  FlightCrew is free software: you can redistribute it and/or modify\r
   8 **  it under the terms of the GNU Lesser General Public License as published\r
   9 **  by the Free Software Foundation, either version 3 of the License, or\r
  10 **  (at your option) any later version.\r
  11 **\r
  12 **  FlightCrew is distributed in the hope that it will be useful,\r
  13 **  but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  14 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  15 **  GNU Lesser General Public License for more details.\r
  16 **\r
  17 **  You should have received a copy of the GNU Lesser General Public License\r
  18 **  along with FlightCrew.  If not, see <http://www.gnu.org/licenses/>.\r
  19 **\r
  20 *************************************************************************/\r
  21 \r
  22 #include <stdafx.h>\r
  23 #include "Utilities.h"\r
  24 #include <fstream>\r
  25 #include <utf8.h>\r
  26 #include <xercesc/util/TransService.hpp>\r
  27 #include <xercesc/dom/DOMDocument.hpp>\r
  28 #include <xercesc/framework/MemBufInputSource.hpp>\r
  29 #include <XmlUtils.h>\r
  30 #include <LocationAwareDOMParser.h>\r
  31 #include <boost/filesystem/detail/utf8_codecvt_facet.hpp>\r
  32 #include <ToXercesStringConverter.h>\r
  33 \r
  34 \r
  35 namespace FlightCrew\r
  36 {\r
  37 \r
  38 namespace Util\r
  39 {\r
  40 \r
  41 std::string ReadUnicodFile( const fs::path &filepath )\r
  42 {\r
  43     fs::ifstream file( filepath, std::ios::in | std::ios::binary );\r
  44 \r
  45     if ( !file.is_open() )\r
  46 \r
  47           boost_throw( FileDoesNotExistEx() << ei_FilePath( BoostPathToUtf8Path( filepath ) ) );\r
  48 \r
  49     std::vector< char > contents( (std::istreambuf_iterator< char>( file )), \r
  50                                    std::istreambuf_iterator< char>() );\r
  51 \r
  52     // May as well be empty\r
  53     if ( contents.size() < 2 )\r
  54     \r
  55         return std::string();\r
  56 \r
  57     if ( utf8::is_valid( contents.begin(), contents.end() ) )\r
  58 \r
  59         return std::string( contents.begin(), contents.end() );\r
  60 \r
  61     // UTF-16BE\r
  62     if ( static_cast< unsigned char >( contents[ 0 ] ) == 0xfeU &&\r
  63          static_cast< unsigned char >( contents[ 1 ] ) == 0xffU )\r
  64     {\r
  65         xc::TranscodeFromStr transcoder( \r
  66             (const XMLByte*) &( *contents.begin() ), contents.size() , "UTF-16BE" );\r
  67 \r
  68         xc::TranscodeToStr transcoder_utf8( transcoder.str(), "UTF-8" );\r
  69 \r
  70         return std::string( (char*) transcoder_utf8.str() );\r
  71     }\r
  72 \r
  73     // UTF-16LE\r
  74     else if ( static_cast< unsigned char >( contents[ 0 ] ) == 0xffU &&\r
  75               static_cast< unsigned char >( contents[ 1 ] ) == 0xfeU )\r
  76     {\r
  77         xc::TranscodeFromStr transcoder( \r
  78             (const XMLByte*) &( *contents.begin() ), contents.size(), "UTF-16LE" );\r
  79 \r
  80         xc::TranscodeToStr transcoder_utf8( transcoder.str(), "UTF-8" );\r
  81 \r
  82         return std::string( (char*) transcoder_utf8.str() );\r
  83     }\r
  84 \r
  85     else\r
  86     {\r
  87         boost_throw( FileNotInUnicodeEx() << ei_FilePath( filepath.generic_string() ) );\r
  88     }\r
  89 }\r
  90 \r
  91 \r
  92 std::string GetFirstNumChars( const std::string &string, uint num_chars )\r
  93 {\r
  94     if ( string.empty() )\r
  95 \r
  96         return std::string();\r
  97 \r
  98     uint string_size   = static_cast< unsigned int >( string.size() );\r
  99     uint chars_to_copy = string_size < num_chars ? string_size : num_chars;\r
 100 \r
 101     std::string::const_iterator it = string.begin();\r
 102     std::advance( it, chars_to_copy );\r
 103 \r
 104     std::string line;\r
 105     line.resize( num_chars );\r
 106 \r
 107     std::copy( string.begin(), it, line.begin() );\r
 108 \r
 109     return line;\r
 110 }\r
 111 \r
 112 \r
 113 std::string GetFirstNumCharsFromFile( const fs::path &filepath, uint num_chars )\r
 114 {\r
 115     try\r
 116     {\r
 117         // TODO: Let's not load the entire file\r
 118         std::string contents = Util::ReadUnicodFile( filepath );\r
 119         return GetFirstNumChars( contents, num_chars );\r
 120     }\r
 121 \r
 122     catch ( FileNotInUnicodeEx& )\r
 123     {\r
 124         return std::string();\r
 125     }  \r
 126 }\r
 127 \r
 128 \r
 129 int LineOfCharIndex( const std::string &string, unsigned int char_index )\r
 130 {\r
 131     // \x0A is the line feed char, \r
 132     // \x0D is the carriage return char\r
 133 \r
 134     std::string line_marker;\r
 135 \r
 136     if ( string.find( "\x0A" ) != std::string::npos )\r
 137 \r
 138         line_marker = "\x0A";\r
 139 \r
 140     else \r
 141 \r
 142         line_marker = "\x0D";\r
 143 \r
 144     size_t search_start = 0;\r
 145     int count = 1;\r
 146 \r
 147     while ( true )\r
 148     {\r
 149         size_t position = string.find( line_marker, search_start );\r
 150 \r
 151         if ( position == std::string::npos || position > char_index )\r
 152 \r
 153             break;\r
 154 \r
 155         ++count;        \r
 156         search_start = position + 1;\r
 157     }\r
 158 \r
 159     return count;\r
 160 }\r
 161 \r
 162 \r
 163 boost::shared_ptr< xc::DOMDocument > RaiiWrapDocument( xc::DOMDocument *document )\r
 164 {\r
 165     return boost::shared_ptr< xc::DOMDocument >( document, XercesExt::XercesDeallocator< xc::DOMDocument > );\r
 166 }\r
 167 \r
 168 \r
 169 boost::shared_ptr< xc::DOMDocument > LoadXmlDocument( const fs::path &filepath )\r
 170 {\r
 171     if ( filepath.empty() )\r
 172 \r
 173         boost_throw( XercesParsingError() );  \r
 174 \r
 175     xe::LocationAwareDOMParser parser;\r
 176 \r
 177     // This scanner ignores schemas and DTDs\r
 178     parser.useScanner( xc::XMLUni::fgWFXMLScanner );\r
 179     parser.setValidationScheme( xc::AbstractDOMParser::Val_Never );\r
 180     parser.setDoNamespaces( true );\r
 181 \r
 182     parser.parse( toX( BoostPathToUtf8Path( filepath ) ) );\r
 183 \r
 184     xc::DOMDocument *document = parser.adoptDocument();\r
 185 \r
 186     if ( !document )\r
 187 \r
 188         boost_throw( XercesParsingError() );        \r
 189 \r
 190     return RaiiWrapDocument( document );\r
 191 }\r
 192 \r
 193 \r
 194 boost::shared_ptr< xc::DOMDocument > LoadXhtmlDocument( const fs::path &filepath )\r
 195 {\r
 196     if ( filepath.empty() )\r
 197 \r
 198         boost_throw( XercesParsingError() );  \r
 199 \r
 200     xe::LocationAwareDOMParser parser;\r
 201 \r
 202     parser.setDoSchema(             false );\r
 203     parser.setLoadSchema(           false );\r
 204     parser.setSkipDTDValidation(    true  );\r
 205     parser.setDoNamespaces(         true  );\r
 206     parser.useCachedGrammarInParse( true  );\r
 207 \r
 208     parser.setValidationScheme( xc::AbstractDOMParser::Val_Never );\r
 209 \r
 210     // This scanner ignores schemas, but does use DTDs\r
 211     parser.useScanner( xc::XMLUni::fgDGXMLScanner );\r
 212 \r
 213     const xc::MemBufInputSource input( XHTML11_FLAT_DTD,\r
 214                                        XHTML11_FLAT_DTD_LEN,\r
 215                                        toX( XHTML11_FLAT_DTD_ID ) );\r
 216 \r
 217     parser.loadGrammar( input, xc::Grammar::DTDGrammarType, true ); \r
 218 \r
 219     parser.parse( toX( BoostPathToUtf8Path( filepath ) ) );\r
 220 \r
 221     xc::DOMDocument *document = parser.adoptDocument();\r
 222 \r
 223     if ( !document )\r
 224 \r
 225         boost_throw( XercesParsingError() );        \r
 226 \r
 227     return RaiiWrapDocument( document );\r
 228 }\r
 229 \r
 230 \r
 231 char CharFromTwoHex( std::string two_hex_chars )\r
 232 {\r
 233     std::istringstream stream( two_hex_chars );\r
 234     int int_value;\r
 235     stream >> std::hex >> int_value;\r
 236 \r
 237     return static_cast< char >( int_value );\r
 238 }\r
 239 \r
 240 \r
 241 std::string UrlDecode( const std::string &encoded_url )\r
 242 {\r
 243     std::string decoded;\r
 244     decoded.reserve( encoded_url.size() );\r
 245 \r
 246     uint i = 0;\r
 247     while ( i < encoded_url.size() )\r
 248     {\r
 249         if ( encoded_url[ i ] == '%' &&\r
 250              i + 2 < encoded_url.size() )\r
 251         {\r
 252             decoded += CharFromTwoHex( encoded_url.substr( i + 1, 2 ) );\r
 253             i += 3;            \r
 254         }\r
 255 \r
 256         else \r
 257         {\r
 258             decoded += encoded_url[ i ];\r
 259             ++i;\r
 260         }\r
 261     }\r
 262 \r
 263     return decoded;\r
 264 }\r
 265 \r
 266 \r
 267 std::string GetUrlFragment( const std::string &decoded_url )\r
 268 {\r
 269     int hash_location = static_cast< int >( decoded_url.find( '#' ) );\r
 270 \r
 271     if ( hash_location != -1 && \r
 272          hash_location + 1 < static_cast< int >( decoded_url.size() ) )\r
 273     {\r
 274         return decoded_url.substr( hash_location + 1, decoded_url.size() );\r
 275     }\r
 276     \r
 277     return std::string();\r
 278 }\r
 279 \r
 280 \r
 281 std::string UrlWithoutFragment( const std::string &decoded_url )\r
 282 {\r
 283     int hash_location = static_cast< int >( decoded_url.find( '#' ) );\r
 284 \r
 285     if ( hash_location != -1 )\r
 286     \r
 287         return decoded_url.substr( 0, hash_location );    \r
 288     \r
 289     return decoded_url;\r
 290 }\r
 291 \r
 292 \r
 293 std::string UrlWithoutFileScheme( const std::string &decoded_url )\r
 294 {\r
 295     if ( boost::starts_with( decoded_url, "file://" ) )\r
 296 \r
 297         return boost::erase_first_copy( decoded_url, "file://" );\r
 298 \r
 299     return decoded_url;\r
 300 }\r
 301 \r
 302 \r
 303 \r
 304 fs::path NormalizePath( const fs::path &filepath )\r
 305 {\r
 306     std::string path_string = BoostPathToUtf8Path( filepath );        \r
 307     boost::regex up_dir_regex( "[^/]+/\\.\\./" );\r
 308 \r
 309     while ( true )\r
 310     {\r
 311         std::string old_path = path_string;\r
 312         path_string = boost::erase_all_regex_copy( path_string, up_dir_regex );\r
 313 \r
 314         if ( path_string == old_path )\r
 315 \r
 316             break;\r
 317     }\r
 318 \r
 319     boost::regex current_dir_regex( "(?<=/)\\./" );\r
 320 \r
 321     while ( true )\r
 322     {\r
 323         std::string old_path = path_string;\r
 324         path_string = boost::erase_all_regex_copy( path_string, current_dir_regex );\r
 325 \r
 326         if ( path_string == old_path )\r
 327 \r
 328             break;\r
 329     }\r
 330 \r
 331     return Utf8PathToBoostPath( path_string );    \r
 332 }\r
 333 \r
 334 \r
 335 fs::path Utf8PathToBoostPath( const std::string &utf8_path )\r
 336 {\r
 337     if ( utf8_path.empty() )\r
 338 \r
 339         return fs::path();\r
 340 \r
 341     if ( !utf8::is_valid( utf8_path.begin(), utf8_path.end() ) )\r
 342         \r
 343         boost_throw( PathNotInUtf8() << ei_FilePath( utf8_path ) );    \r
 344 \r
 345     boost::filesystem::detail::utf8_codecvt_facet utf8facet;\r
 346     return fs::path( utf8_path, utf8facet );\r
 347 }\r
 348 \r
 349 \r
 350 std::string BoostPathToUtf8Path( const fs::path &filepath )\r
 351 {\r
 352     if ( filepath.empty() )\r
 353 \r
 354         return std::string();\r
 355 \r
 356     boost::filesystem::detail::utf8_codecvt_facet utf8facet;\r
 357     return filepath.generic_string( utf8facet );\r
 358 }\r
 359 \r
 360 \r
 361 // Taking by const ref and making a copy could be costly,\r
 362 // but you know what they say about premature optimization.\r
 363 // If the profiler ends up screaming at this, then we'll refactor.\r
 364 std::vector< Result > AddPathToResults( const std::vector< Result > &results, const fs::path &filepath )\r
 365 {\r
 366     std::vector< Result > mod_results = results;\r
 367 \r
 368     foreach( Result &result, mod_results )\r
 369     {\r
 370         if ( result.GetFilepath().empty() )\r
 371 \r
 372             result.SetFilepath( BoostPathToUtf8Path( filepath ) );\r
 373     }\r
 374 \r
 375     return mod_results;\r
 376 }\r
 377 \r
 378 \r
 379 } // namespace Util\r
 380 \r
 381 } // namespace FlightCrew\r