e74dfcb
Description: Try to reduce confusion around docx files
e74dfcb
 Detects ZIP (docx/OpenDocument), XML and HTML files and reports a specific error for each
e74dfcb
Author: Olly Betts <olly@survex.com>
e74dfcb
Bug-Debian: https://bugs.debian.org/758959
e74dfcb
Bug-Debian: https://bugs.debian.org/791532
e74dfcb
Forwarded: no
e74dfcb
Last-Update: 2015-01-11
e74dfcb
e74dfcb
--- a/Docs/antiword.1
e74dfcb
+++ b/Docs/antiword.1
e74dfcb
@@ -14,7 +14,11 @@
e74dfcb
 .br
e74dfcb
 A wordfile named - stands for a Word document read from the standard input.
e74dfcb
 .br
e74dfcb
-Only documents made by MS Word version 2 and version 6 or later are supported.
e74dfcb
+Only the binary format documents made by MS Word version 2, 6, 7, 97, 2000 and
e74dfcb
+2003 are supported.  Newer Word versions default to using a completely
e74dfcb
+different format consisting of XML files in a ZIP container (usually with a
e74dfcb
+".docx" file extension) which antiword doesn't support.  It also doesn't
e74dfcb
+support the "flat" XML format which MS Word 2003 supported.
e74dfcb
 .SH OPTIONS
e74dfcb
 .TP
e74dfcb
 .BI "\-a " papersize
e74dfcb
--- a/antiword.h
e74dfcb
+++ b/antiword.h
e74dfcb
@@ -695,6 +695,9 @@
e74dfcb
 extern BOOL	bIsWordForDosFile(FILE *, long);
e74dfcb
 extern BOOL	bIsRtfFile(FILE *);
e74dfcb
 extern BOOL	bIsWordPerfectFile(FILE *);
e74dfcb
+extern BOOL	bIsZipFile(FILE *);
e74dfcb
+extern BOOL	bIsXMLFile(FILE *);
e74dfcb
+extern BOOL	bIsHTMLFile(FILE *);
e74dfcb
 extern BOOL	bIsWinWord12File(FILE *, long);
e74dfcb
 extern BOOL	bIsMacWord45File(FILE *);
e74dfcb
 extern int	iGuessVersionNumber(FILE *, long);
e74dfcb
--- a/main_u.c
e74dfcb
+++ b/main_u.c
e74dfcb
@@ -187,10 +187,29 @@
e74dfcb
 			werr(0, "%s is not a Word Document."
e74dfcb
 				" It is probably a Rich Text Format file",
e74dfcb
 				szFilename);
e74dfcb
-		} if (bIsWordPerfectFile(pFile)) {
e74dfcb
+		} else if (bIsWordPerfectFile(pFile)) {
e74dfcb
 			werr(0, "%s is not a Word Document."
e74dfcb
 				" It is probably a Word Perfect file",
e74dfcb
 				szFilename);
e74dfcb
+		} else if (bIsZipFile(pFile)) {
e74dfcb
+			werr(0, "%s is not a Word Document."
e74dfcb
+				" It seems to be a ZIP file, so is probably"
e74dfcb
+				" an OpenDocument file, or a \"docx\" file"
e74dfcb
+				" from MS Word 2007 or newer"
e74dfcb
+				" (antiword only handles binary format"
e74dfcb
+				" documents from MS Word 2003 and earlier)",
e74dfcb
+				szFilename);
e74dfcb
+		} else if (bIsXMLFile(pFile)) {
e74dfcb
+			werr(0, "%s is not a Word Document."
e74dfcb
+				" It seems to be an XML file, perhaps"
e74dfcb
+				" the XML format from MS Word 2003"
e74dfcb
+				" (antiword only handles binary format"
e74dfcb
+				" documents from MS Word 2003 and earlier)",
e74dfcb
+				szFilename);
e74dfcb
+		} else if (bIsHTMLFile(pFile)) {
e74dfcb
+			werr(0, "%s is not a Word Document."
e74dfcb
+				" It is probably an HTML file",
e74dfcb
+				szFilename);
e74dfcb
 		} else {
e74dfcb
 #if defined(__dos)
e74dfcb
 			werr(0, "%s is not a Word Document or the filename"
e74dfcb
--- a/wordlib.c
e74dfcb
+++ b/wordlib.c
e74dfcb
@@ -41,7 +41,7 @@
e74dfcb
 BOOL
e74dfcb
 bIsWordForDosFile(FILE *pFile, long lFilesize)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[] =
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
 		{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab };	/* Word for DOS */
e74dfcb
 
e74dfcb
 	DBG_MSG("bIsWordForDosFile");
e74dfcb
@@ -64,7 +64,7 @@
e74dfcb
 static BOOL
e74dfcb
 bIsWordFileWithOLE(FILE *pFile, long lFilesize)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[] =
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
 		{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
e74dfcb
 	int	iTailLen;
e74dfcb
 
e74dfcb
@@ -108,7 +108,7 @@
e74dfcb
 BOOL
e74dfcb
 bIsRtfFile(FILE *pFile)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[] =
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
 		{ '{', '\\', 'r', 't', 'f', '1' };
e74dfcb
 
e74dfcb
 	DBG_MSG("bIsRtfFile");
e74dfcb
@@ -122,7 +122,7 @@
e74dfcb
 BOOL
e74dfcb
 bIsWordPerfectFile(FILE *pFile)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[] =
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
 		{ 0xff, 'W', 'P', 'C' };
e74dfcb
 
e74dfcb
 	DBG_MSG("bIsWordPerfectFile");
e74dfcb
@@ -131,13 +131,65 @@
e74dfcb
 } /* end of bIsWordPerfectFile */
e74dfcb
 
e74dfcb
 /*
e74dfcb
+ * This function checks whether the given file is or is not a ZIP file
e74dfcb
+ */
e74dfcb
+BOOL
e74dfcb
+bIsZipFile(FILE *pFile)
e74dfcb
+{
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
+		{ 'P', 'K', 0x03, 0x04 };
e74dfcb
+
e74dfcb
+	DBG_MSG("bIsZipFile");
e74dfcb
+
e74dfcb
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
e74dfcb
+} /* end of bIsZipFile */
e74dfcb
+
e74dfcb
+/*
e74dfcb
+ * This function checks whether the given file is or is not an XML file
e74dfcb
+ */
e74dfcb
+BOOL
e74dfcb
+bIsXMLFile(FILE *pFile)
e74dfcb
+{
e74dfcb
+	static const UCHAR	aucBytes[] =
e74dfcb
+		{ '<', '?', 'x', 'm', 'l' };
e74dfcb
+
e74dfcb
+	DBG_MSG("bIsXMLFile");
e74dfcb
+
e74dfcb
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
e74dfcb
+} /* end of bIsXMLFile */
e74dfcb
+
e74dfcb
+/*
e74dfcb
+ * This function checks whether the given file is or is not an HTML file
e74dfcb
+ */
e74dfcb
+BOOL
e74dfcb
+bIsHTMLFile(FILE *pFile)
e74dfcb
+{
e74dfcb
+	static const UCHAR	aucBytes[2][5] = {
e74dfcb
+		{ '<', 'h', 't', 'm', 'l' },
e74dfcb
+		{ '<', 'H', 'T', 'M', 'L' },
e74dfcb
+	};
e74dfcb
+	int	iIndex;
e74dfcb
+
e74dfcb
+	DBG_MSG("bIsHTMLFile");
e74dfcb
+
e74dfcb
+	for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
e74dfcb
+		if (bCheckBytes(pFile,
e74dfcb
+				aucBytes[iIndex],
e74dfcb
+				elementsof(aucBytes[iIndex]))) {
e74dfcb
+			return TRUE;
e74dfcb
+		}
e74dfcb
+	}
e74dfcb
+	return FALSE;
e74dfcb
+} /* end of bIsHTMLFile */
e74dfcb
+
e74dfcb
+/*
e74dfcb
  * This function checks whether the given file is or is not a "Win Word 1 or 2"
e74dfcb
  * document
e74dfcb
  */
e74dfcb
 BOOL
e74dfcb
 bIsWinWord12File(FILE *pFile, long lFilesize)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[2][4] = {
e74dfcb
+	static const UCHAR	aucBytes[2][4] = {
e74dfcb
 		{ 0x9b, 0xa5, 0x21, 0x00 },	/* Win Word 1.x */
e74dfcb
 		{ 0xdb, 0xa5, 0x2d, 0x00 },	/* Win Word 2.0 */
e74dfcb
 	};
e74dfcb
@@ -171,7 +223,7 @@
e74dfcb
 BOOL
e74dfcb
 bIsMacWord45File(FILE *pFile)
e74dfcb
 {
e74dfcb
-	static UCHAR	aucBytes[2][6] = {
e74dfcb
+	static const UCHAR	aucBytes[2][6] = {
e74dfcb
 		{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 },	/* Mac Word 4 */
e74dfcb
 		{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 },	/* Mac Word 5 */
e74dfcb
 	};