/************************************************************************
**
** Copyright (C) 2010 Strahinja Markovic
**
** This file is part of FlightCrew.
**
** FlightCrew is free software: you can redistribute it and/or modify
** it under the terms of the GNU Lesser General Public License as published
** by the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** FlightCrew is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public License
** along with FlightCrew. If not, see <http://www.gnu.org/licenses/>.
**
*************************************************************************/
\r
23 #include "ReachabilityAnalysis.h"
\r
24 #include <ToXercesStringConverter.h>
\r
25 #include <FromXercesStringConverter.h>
\r
26 #include <XmlUtils.h>
\r
27 #include "Misc/DetermineMimetype.h"
\r
28 #include "Misc/Utilities.h"
\r
33 namespace filesystem3
\r
35 // This overload of the boost hash_value func
\r
36 // is necessary so that we can put fs::paths
\r
37 // in boost::unordered_sets
\r
38 std::size_t hash_value( const fs::path &mypath )
\r
41 boost::hash< std::wstring > hasher;
\r
42 return hasher( mypath.generic_wstring() );
\r
44 boost::hash< std::string > hasher;
\r
45 return hasher( mypath.generic_string() );
\r
48 } // namespace filesystem3
\r
50 } // namespace boost
\r
53 namespace FlightCrew
\r
57 std::vector< Result > ReachabilityAnalysis::ValidateXml(
\r
58 const xc::DOMDocument &document,
\r
59 const fs::path &filepath )
\r
61 const fs::path opf_folder_path = filepath.parent_path();
\r
63 boost::unordered_map< std::string, fs::path > manifest_items =
\r
64 GetManifestItems( document, opf_folder_path );
\r
66 boost::unordered_set< fs::path > starting_set =
\r
67 StartingSetOpsPaths( document, manifest_items, opf_folder_path );
\r
69 boost::unordered_set< fs::path > reachable_resources =
\r
70 DetermineReachableResources( starting_set );
\r
72 std::vector< Result > results;
\r
74 Util::Extend( results, ResultsForOpsDocsNotInSpine( document, manifest_items, reachable_resources ) );
\r
75 Util::Extend( results, ResultsForResourcesNotInManifest( manifest_items, reachable_resources ) );
\r
76 Util::Extend( results, ResultsForUnusedResources( manifest_items, reachable_resources ) );
\r
82 std::vector< Result > ReachabilityAnalysis::ResultsForOpsDocsNotInSpine(
\r
83 const xc::DOMDocument &document,
\r
84 const boost::unordered_map< std::string, fs::path > &manifest_items,
\r
85 const boost::unordered_set< fs::path > &reachable_resources )
\r
87 boost::unordered_set< fs::path > spine_paths = SpinePaths( document, manifest_items );
\r
88 boost::unordered_set< fs::path > ops_docs = GetOnlyOpsDocs( reachable_resources );
\r
90 std::vector< Result > results;
\r
92 foreach( const fs::path &ops_path, ops_docs )
\r
94 if ( !spine_paths.count( ops_path ) )
\r
97 Result( ERROR_OPF_REACHABLE_OPS_DOC_NOT_IN_SPINE )
\r
98 .SetFilepath( Util::BoostPathToUtf8Path( ops_path ) )
\r
107 std::vector< Result > ReachabilityAnalysis::ResultsForResourcesNotInManifest(
\r
108 const boost::unordered_map< std::string, fs::path > &manifest_items,
\r
109 const boost::unordered_set< fs::path > &reachable_resources )
\r
111 boost::unordered_set< fs::path > manifest_paths =
\r
112 GetPathsFromItems( manifest_items );
\r
114 std::vector< Result > results;
\r
116 foreach( const fs::path &resource_path, reachable_resources )
\r
118 if ( !manifest_paths.count( resource_path ) )
\r
121 Result( ERROR_OPF_REACHABLE_RESOURCE_NOT_IN_MANIFEST )
\r
122 .SetFilepath( Util::BoostPathToUtf8Path( resource_path ) )
\r
131 std::vector< Result > ReachabilityAnalysis::ResultsForUnusedResources(
\r
132 const boost::unordered_map< std::string, fs::path > &manifest_items,
\r
133 const boost::unordered_set< fs::path > &reachable_resources )
\r
135 std::vector< Result > results;
\r
137 boost::unordered_set< fs::path > manifest_paths =
\r
138 GetPathsFromItems( manifest_items );
\r
140 foreach( const fs::path &manifest_path, manifest_paths )
\r
142 if ( !reachable_resources.count( manifest_path ) &&
\r
143 !AllowedToBeNotReachable( manifest_path ) )
\r
146 Result( WARNING_OPF_RESOURCE_IN_MANIFEST_NOT_REACHABLE )
\r
147 .SetFilepath( Util::BoostPathToUtf8Path( manifest_path ) )
\r
156 bool ReachabilityAnalysis::AllowedToBeNotReachable( const fs::path &filepath )
\r
158 // As per spec, the only file that is allowed to be unreachable is the NCX file.
\r
159 return DetermineMimetype( filepath ) == NCX_MIME;
\r
163 boost::unordered_map< std::string, fs::path > ReachabilityAnalysis::GetManifestItems(
\r
164 const xc::DOMDocument &document,
\r
165 const fs::path &opf_folder_path )
\r
167 boost::unordered_map< std::string, fs::path > manifest_items;
\r
169 std::vector< xc::DOMElement* > items = xe::GetElementsByQName(
\r
170 document, QName( "item", OPF_XML_NAMESPACE ) );
\r
172 foreach( xc::DOMElement* item, items )
\r
174 std::string id = fromX( item->getAttribute( toX( "id" ) ) );
\r
175 std::string href = fromX( item->getAttribute( toX( "href" ) ) );
\r
176 fs::path item_path = opf_folder_path /
\r
177 Util::Utf8PathToBoostPath( Util::UrlDecode( href ) );
\r
179 manifest_items[ id ] = item_path;
\r
182 return manifest_items;
\r
186 boost::unordered_set< fs::path > ReachabilityAnalysis::StartingSetOpsPaths(
\r
187 const xc::DOMDocument &document,
\r
188 const boost::unordered_map< std::string, fs::path > &manifest_items,
\r
189 const fs::path &opf_folder_path )
\r
191 boost::unordered_set< fs::path > starting_set = SpinePaths( document, manifest_items );
\r
192 starting_set = Util::SetUnion( starting_set, GuidePaths( document, opf_folder_path ) );
\r
193 starting_set = Util::SetUnion( starting_set, ToursPaths( document, opf_folder_path ) );
\r
194 starting_set = Util::SetUnion( starting_set, NcxPaths( document, manifest_items ) );
\r
196 return starting_set;
\r
200 boost::unordered_set< fs::path > ReachabilityAnalysis::SpinePaths(
\r
201 const xc::DOMDocument &document,
\r
202 const boost::unordered_map< std::string, fs::path > &manifest_items )
\r
204 boost::unordered_set< fs::path > spine_paths;
\r
206 std::vector< xc::DOMElement* > items = xe::GetElementsByQName(
\r
207 document, QName( "itemref", OPF_XML_NAMESPACE ) );
\r
209 foreach( xc::DOMElement* item, items )
\r
211 std::string idref = fromX( item->getAttribute( toX( "idref" ) ) );
\r
213 if ( manifest_items.count( idref ) > 0 )
\r
215 spine_paths.insert( manifest_items.at( idref ) );
\r
218 return spine_paths;
\r
222 boost::unordered_set< fs::path > ReachabilityAnalysis::GuidePaths(
\r
223 const xc::DOMDocument &document,
\r
224 const fs::path &opf_folder_path )
\r
226 boost::unordered_set< fs::path > guide_paths;
\r
228 std::vector< xc::DOMElement* > references = xe::GetElementsByQName(
\r
229 document, QName( "reference", OPF_XML_NAMESPACE ) );
\r
231 foreach( xc::DOMElement* reference, references )
\r
233 std::string href = fromX( reference->getAttribute( toX( "href" ) ) );
\r
234 fs::path reference_path = opf_folder_path /
\r
235 Util::Utf8PathToBoostPath( Util::UrlWithoutFragment( Util::UrlDecode( href ) ) );
\r
237 guide_paths.insert( reference_path );
\r
240 return guide_paths;
\r
244 boost::unordered_set< fs::path > ReachabilityAnalysis::ToursPaths(
\r
245 const xc::DOMDocument &document,
\r
246 const fs::path &opf_folder_path )
\r
248 boost::unordered_set< fs::path > tours_paths;
\r
250 std::vector< xc::DOMElement* > sites = xe::GetElementsByQName(
\r
251 document, QName( "site ", OPF_XML_NAMESPACE ) );
\r
253 foreach( xc::DOMElement* site, sites )
\r
255 std::string href = fromX( site->getAttribute( toX( "href" ) ) );
\r
256 fs::path site_path = opf_folder_path /
\r
257 Util::Utf8PathToBoostPath( Util::UrlWithoutFragment( Util::UrlDecode( href ) ) );
\r
259 tours_paths.insert( site_path );
\r
262 return tours_paths;
\r
266 fs::path ReachabilityAnalysis::GetPathToNcx(
\r
267 const xc::DOMDocument &document,
\r
268 const boost::unordered_map< std::string, fs::path > &manifest_items )
\r
270 std::vector< xc::DOMAttr* > tocs = xe::GetAllAttributesFromElements(
\r
271 QName( "spine", OPF_XML_NAMESPACE ),
\r
272 QName( "toc", "" ),
\r
275 if ( tocs.empty() )
\r
279 std::string toc_id = fromX( tocs[ 0 ]->getValue() );
\r
281 if ( !manifest_items.count( toc_id ) )
\r
285 return manifest_items.at( toc_id );
\r
289 boost::unordered_set< fs::path > ReachabilityAnalysis::NcxPaths(
\r
290 const xc::DOMDocument &document,
\r
291 const boost::unordered_map< std::string, fs::path > &manifest_items )
\r
293 fs::path ncx_path = GetPathToNcx( document, manifest_items );
\r
294 boost::shared_ptr< xc::DOMDocument > ncx_document;
\r
298 ncx_document = Util::LoadXmlDocument( ncx_path );
\r
301 catch ( std::exception& )
\r
303 // If the file doesn't exist or some other
\r
304 // snafu, then there are obviously no links.
\r
305 return boost::unordered_set< fs::path > ();
\r
308 boost::unordered_set< fs::path > ncx_paths;
\r
310 std::vector< xc::DOMAttr* > srcs = xe::GetAllAttributesFromElements(
\r
311 QName( "content", NCX_XML_NAMESPACE ),
\r
312 QName( "src", "" ),
\r
315 fs::path ncx_folder = ncx_path.parent_path();
\r
317 foreach( xc::DOMAttr* src, srcs )
\r
319 fs::path resource_path =
\r
320 Util::Utf8PathToBoostPath(
\r
321 Util::UrlWithoutFragment(
\r
322 Util::UrlDecode( fromX( src->getValue() ) ) ) );
\r
324 ncx_paths.insert( ncx_folder / resource_path );
\r
331 boost::unordered_set< fs::path > ReachabilityAnalysis::DetermineReachableResources(
\r
332 const boost::unordered_set< fs::path > &starting_ops_paths )
\r
334 boost::unordered_set< fs::path > current_resource_set = starting_ops_paths;
\r
335 boost::unordered_set< fs::path > new_resource_set = current_resource_set;
\r
339 boost::unordered_set< fs::path > reachable_resource_set =
\r
340 GetDirectlyReachableResources( new_resource_set );
\r
342 boost::unordered_set< fs::path > next_resource_set =
\r
343 Util::SetUnion( reachable_resource_set, current_resource_set );
\r
345 if ( next_resource_set == current_resource_set )
\r
349 new_resource_set = Util::SetSubtraction( next_resource_set, current_resource_set );
\r
350 current_resource_set = next_resource_set;
\r
353 return current_resource_set;
\r
357 boost::unordered_set< fs::path > ReachabilityAnalysis::GetDirectlyReachableResources(
\r
358 const boost::unordered_set< fs::path > &resources )
\r
360 return Util::SetUnion(
\r
361 GetLinkedResourcesFromAllOps( GetOnlyOpsDocs( resources ) ),
\r
362 GetLinkedResourcesFromAllCss( GetOnlyCssDocs( resources ) ) );
\r
366 boost::unordered_set< fs::path > ReachabilityAnalysis::GetOnlyOpsDocs(
\r
367 const boost::unordered_set< fs::path > &resources )
\r
369 boost::unordered_set< fs::path > ops_docs;
\r
371 foreach( const fs::path &resource, resources )
\r
373 std::string mimetype = DetermineMimetype( resource );
\r
375 if ( mimetype == XHTML_MIME ||
\r
376 mimetype == DTBOOK_MIME ||
\r
377 mimetype == OEB_DOC_MIME )
\r
379 ops_docs.insert( resource );
\r
387 boost::unordered_set< fs::path > ReachabilityAnalysis::GetOnlyCssDocs(
\r
388 const boost::unordered_set< fs::path > &resources )
\r
390 boost::unordered_set< fs::path > ops_docs;
\r
392 foreach( const fs::path &resource, resources )
\r
394 std::string mimetype = DetermineMimetype( resource );
\r
396 if ( mimetype == CSS_MIME )
\r
398 ops_docs.insert( resource );
\r
406 boost::unordered_set< fs::path > ReachabilityAnalysis::GetLinkedResourcesFromAllOps(
\r
407 const boost::unordered_set< fs::path > &ops_docs )
\r
409 boost::unordered_set< fs::path > all_linked_resources;
\r
411 foreach( const fs::path &ops_doc, ops_docs )
\r
413 all_linked_resources = Util::SetUnion(
\r
414 all_linked_resources, GetLinkedResourcesFromOps( ops_doc ) );
\r
417 return all_linked_resources;
\r
421 boost::unordered_set< fs::path > ReachabilityAnalysis::GetLinkedResourcesFromAllCss(
\r
422 const boost::unordered_set< fs::path > &css_docs )
\r
424 boost::unordered_set< fs::path > all_linked_resources;
\r
426 foreach( const fs::path &css_doc, css_docs )
\r
428 all_linked_resources = Util::SetUnion(
\r
429 all_linked_resources, GetLinkedResourcesFromCss( css_doc ) );
\r
432 return all_linked_resources;
\r
436 boost::unordered_set< fs::path > ReachabilityAnalysis::GetLinkedResourcesFromOps(
\r
437 const fs::path &ops_document )
\r
439 boost::shared_ptr< xc::DOMDocument > document;
\r
443 document = Util::LoadXhtmlDocument( ops_document );
\r
446 catch ( std::exception& )
\r
448 // If the file doesn't exist or some other
\r
449 // snafu, then there are obviously no links.
\r
450 return boost::unordered_set< fs::path > ();
\r
453 xc::DOMNodeList *elements = document->getElementsByTagNameNS(
\r
454 toX( "*" ), toX( "*" ) );
\r
456 boost::unordered_set< fs::path > linked_resources;
\r
457 fs::path ops_doc_folder = ops_document.parent_path();
\r
459 for ( uint i = 0; i < elements->getLength(); ++i )
\r
461 xc::DOMNamedNodeMap *attribute_map = elements->item( i )->getAttributes();
\r
463 if ( !attribute_map )
\r
467 for ( uint j = 0; j < attribute_map->getLength(); ++j )
\r
469 xc::DOMAttr *attribute = static_cast< xc::DOMAttr* >( attribute_map->item( j ) );
\r
470 std::string attribute_name = fromX( attribute->getLocalName() );
\r
472 if ( attribute_name == "href" ||
\r
473 attribute_name == "src" )
\r
475 std::string attribute_value = fromX( attribute->getValue() );
\r
476 fs::path resource_path =
\r
477 Util::Utf8PathToBoostPath(
\r
478 Util::UrlWithoutFileScheme(
\r
479 Util::UrlWithoutFragment(
\r
480 Util::UrlDecode( attribute_value ) ) ) );
\r
482 if ( !IsFilesystemPath( resource_path ) || resource_path.empty() )
\r
486 linked_resources.insert( Util::NormalizePath( ops_doc_folder / resource_path ) );
\r
491 return linked_resources;
\r
495 boost::unordered_set< fs::path > ReachabilityAnalysis::GetLinkedResourcesFromCss(
\r
496 const fs::path &css_document )
\r
498 std::string contents;
\r
502 contents = Util::ReadUnicodFile( css_document );
\r
505 catch ( std::exception& )
\r
507 // If the file doesn't exist or some other
\r
508 // snafu, then there are obviously no links.
\r
509 return boost::unordered_set< fs::path > ();
\r
512 boost::unordered_set< fs::path > linked_resources;
\r
513 fs::path css_doc_folder = css_document.parent_path();
\r
515 // We have to erase all comments first, because we don't want
\r
516 // to count commented-out links.
\r
517 boost::erase_all_regex( contents, boost::regex( "/\\*.*?\\*/" ) );
\r
519 std::string::const_iterator start = contents.begin();
\r
520 std::string::const_iterator end = contents.end();
\r
522 boost::match_results< std::string::const_iterator > matches;
\r
523 boost::regex expression(
\r
524 "(?:(?:src|background|background-image)\\s*:|@import)\\s*"
\r
527 "url\\([\"']?([^\\)\"']+)[\"']?\\)"
\r
529 "[\"']([^\"']+)[\"']"
\r
534 while ( boost::regex_search( start, end, matches, expression ) )
\r
536 start = matches[ 0 ].second;
\r
538 for ( uint i = 1; i < matches.size(); ++i )
\r
540 std::string matched_path = matches[ i ];
\r
541 boost::trim( matched_path );
\r
543 if ( matched_path.empty() )
\r
547 fs::path resource_path = Util::Utf8PathToBoostPath( matched_path );
\r
548 linked_resources.insert( Util::NormalizePath( css_doc_folder / resource_path ) );
\r
552 return linked_resources;
\r
556 boost::unordered_set< fs::path > ReachabilityAnalysis::GetPathsFromItems(
\r
557 const boost::unordered_map< std::string, fs::path > &manifest_items )
\r
559 boost::unordered_set< fs::path > manifest_paths;
\r
561 // Using boost_foreach gives us a warning here,
\r
562 // so we use the normal for loop
\r
563 for ( boost::unordered_map< std::string, fs::path >::const_iterator it = manifest_items.begin();
\r
564 it != manifest_items.end(); ++it )
\r
566 manifest_paths.insert( it->second );
\r
569 return manifest_paths;
\r
573 bool ReachabilityAnalysis::IsFilesystemPath( const fs::path &path )
\r
575 // If the attribute value in a href has ':', it's because
\r
576 // this is a non-filesystem path. We already removed the
\r
577 // "file://" prefix if it existed.
\r
579 return path.string().find( ':' ) == std::string::npos;
\r
583 } // namespace FlightCrew
\r