1 /************************************************************************
\r
3 ** Copyright (C) 2010 Strahinja Markovic
\r
5 ** This file is part of FlightCrew.
\r
7 ** FlightCrew is free software: you can redistribute it and/or modify
\r
8 ** it under the terms of the GNU Lesser General Public License as published
\r
9 ** by the Free Software Foundation, either version 3 of the License, or
\r
10 ** (at your option) any later version.
\r
12 ** FlightCrew is distributed in the hope that it will be useful,
\r
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
15 ** GNU Lesser General Public License for more details.
\r
17 ** You should have received a copy of the GNU Lesser General Public License
\r
18 ** along with FlightCrew. If not, see <http://www.gnu.org/licenses/>.
\r
20 *************************************************************************/
\r
23 #include "DetermineMimetype.h"
\r
24 #include "Utilities.h"
\r
26 namespace FlightCrew
\r
// Mimetype strings reported for the kinds of files found inside an epub.
const std::string OEBPS_MIME   = "application/oebps-package+xml";
const std::string XHTML_MIME   = "application/xhtml+xml";
const std::string NCX_MIME     = "application/x-dtbncx+xml";
const std::string PNG_MIME     = "image/png";
const std::string GIF_MIME     = "image/gif";
const std::string JPEG_MIME    = "image/jpeg";
const std::string SVG_MIME     = "image/svg+xml";
const std::string DTBOOK_MIME  = "application/x-dtbook+xml";
const std::string CSS_MIME     = "text/css";
const std::string XML_MIME     = "application/xml"; // used for out-of-line xml islands
const std::string XPGT_MIME    = "application/vnd.adobe-page-template+xml";
const std::string OTF_MIME     = "application/vnd.ms-opentype";

// For the "correct" TrueType font mimetype, see this link:
//   http://mx.gw.com/pipermail/file/2009/000400.html
// Apparently ISO/IEC JTC 1/SC34 are working on a new top-level font
// media type. On the other hand, they also recognize
// "application/x-font-ttf" as being the experimental (read: not
// standardized) de facto MIME type for TrueType fonts.
// Number of Google hits for all three possibilities:
//   "application/x-truetype-font"     2100
//   "application/x-font-truetype"     4100
//   "application/x-font-ttf"         45900
//
// So "application/x-font-ttf" it is.
const std::string TTF_MIME     = "application/x-font-ttf";
const std::string OEB_DOC_MIME = "text/x-oeb1-document";
const std::string OEB_CSS_MIME = "text/x-oeb1-css";

// Returned by the functions in this file when no mimetype could be determined.
const std::string UNKNOWN_MIME = "unknown";
\r
59 // It's just an arbitrary num of starting chars
\r
60 // that we search for a fingerprint. Things like
\r
61 // "<html>" should appear in this small section.
\r
62 static const uint NUM_CHARS_FOR_FINGERPRINT = 1000;
\r
63 static const boost::regex HTML_TAG_REGEX( "<\\s*html[^>]*>" );
\r
65 static const std::string NCX_SYSTEM_ID = "-//NISO//DTD ncx 2005-1//EN";
\r
66 static const std::string DTBOOK_SYSTEM_ID = "-//NISO//DTD dtbook 2005-1//EN";
\r
68 static const boost::regex NCX_TAG_REGEX(
\r
69 "<[^>]*ncx[^>]*\"http://www.daisy.org/z3986/2005/ncx/\"[^>]*>" );
\r
71 static const boost::regex XPGT_TEMPLATE_REGEX(
\r
72 "<[^>]*template[^>]*\"http://ns.adobe.com/2006/ade\"[^>]*>" );
\r
74 static const boost::regex DTBOOK_TAG_REGEX(
\r
75 "<[^>]*dtbook[^>]*\"http://www.daisy.org/z3986/2005/dtbook/\"[^>]*>" );
\r
78 std::string MimetypeFromExtension( const fs::path &filepath )
\r
80 std::string extension = Util::BoostPathToUtf8Path( filepath.extension() );
\r
81 boost::erase_first( extension, "." );
\r
83 if ( extension == "xhtml" ||
\r
84 extension == "html" ||
\r
85 extension == "htm" )
\r
87 // Only the xhtml mimetype is valid
\r
88 // within epub, "text/html" is not
\r
92 if ( extension == "png" )
\r
96 if ( extension == "gif" )
\r
100 if ( extension == "jpg" ||
\r
101 extension == "jpeg" )
\r
106 if ( extension == "css" )
\r
110 if ( extension == "ncx" )
\r
114 if ( extension == "svg" )
\r
118 if ( extension == "otf" )
\r
122 if ( extension == "ttf" )
\r
126 // We don't check for "xml" because
\r
127 // that's commonly used for several things.
\r
129 return UNKNOWN_MIME;
\r
133 bool HasHtmlFingerprint( const std::string &contents )
\r
135 return boost::regex_search( contents, HTML_TAG_REGEX );
\r
139 bool HasDtbookFingerprint( const std::string &contents )
\r
142 boost::contains( contents, DTBOOK_SYSTEM_ID ) ||
\r
143 boost::regex_search( contents, DTBOOK_TAG_REGEX );
\r
147 bool HasNcxFingerprint( const std::string &contents )
\r
150 boost::contains( contents, NCX_SYSTEM_ID ) ||
\r
151 boost::regex_search( contents, NCX_TAG_REGEX );
\r
155 bool HasXpgtFingerprint( const std::string &contents )
\r
157 return boost::regex_search( contents, XPGT_TEMPLATE_REGEX );
\r
161 std::string GuessMimetypeFromFileContents( const fs::path &filepath )
\r
163 std::string contents;
\r
167 contents = Util::ReadUnicodFile( filepath );
\r
170 catch ( std::exception& )
\r
172 return UNKNOWN_MIME;
\r
175 std::string contents_start = Util::GetFirstNumChars( contents, NUM_CHARS_FOR_FINGERPRINT );
\r
177 if ( HasHtmlFingerprint( contents_start ) )
\r
181 if ( HasNcxFingerprint( contents_start ) )
\r
183 return DTBOOK_MIME;
\r
185 if ( HasNcxFingerprint( contents_start ) )
\r
189 if ( HasXpgtFingerprint( contents_start ) )
\r
193 return UNKNOWN_MIME;
\r
197 std::string DetermineMimetype( const fs::path &filepath )
\r
199 std::string mimetype = MimetypeFromExtension( filepath );
\r
201 if ( mimetype != UNKNOWN_MIME )
\r
205 return GuessMimetypeFromFileContents( filepath );
\r
209 } // namespace FlightCrew
\r