From 66a1fcc69765bb704146fe7d084848302dd3c89e Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 15 Jul 2023 17:33:52 +0200
Subject: [PATCH 2/3] Sanitize libxml2 globals before parsing

Fixes GHSA-3qrf-m4j2-pcrr.

To parse a document with libxml2, you first need to create a parsing context.
The parsing context contains parsing options (e.g. XML_NOENT to substitute
entities) that the application (in this case PHP) can set.
Unfortunately, libxml2 also supports providing default set options.
For example, if you call xmlSubstituteEntitiesDefault(1) then the XML_NOENT
option will be added to the parsing options every time you create a parsing
context **even if the application never requested XML_NOENT**.

Third party extensions can override these globals, in particular the
substitute entity global. This causes entity substitution to be
unexpectedly active.

Fix it by setting the parsing options to a sane known value.
For API calls that depend on global state we introduce
PHP_LIBXML_SANITIZE_GLOBALS() and PHP_LIBXML_RESTORE_GLOBALS().
For other APIs that work directly with a context we introduce
php_libxml_sanitize_parse_ctxt_options().

(cherry picked from commit c283c3ab0ba45d21b2b8745c1f9c7cbfe771c975)
(cherry picked from commit b3758bd21223b97c042cae7bd26a66cde081ea98)
(cherry picked from commit 4fb61f06b1aff89a4d7e548c37ffa5bf573270c3)
(cherry picked from commit d7de6908dfc8774e86a54100ad4e2ee810426001)
---
 ext/dom/document.c            | 15 +++++++++++++++
 ext/dom/documentfragment.c    |  2 ++
 ext/libxml/php_libxml.h       | 36 +++++++++++++++++++++++++++++++++++
 ext/simplexml/simplexml.c     |  6 ++++++
 ext/soap/php_xml.c            |  2 ++
 ext/xml/compat.c              |  2 ++
 ext/xmlreader/php_xmlreader.c |  9 +++++++++
 ext/xsl/xsltprocessor.c       |  9 ++++-----
 8 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/ext/dom/document.c b/ext/dom/document.c
index c212faa695..b0eed5820b 100644
--- a/ext/dom/document.c
+++ b/ext/dom/document.c
@@ -1438,6 +1438,7 @@ static xmlDocPtr dom_document_parser(zval *id, int mode, char *source, size_t so
 		options |= XML_PARSE_NOBLANKS;
 	}
 
+	php_libxml_sanitize_parse_ctxt_options(ctxt);
 	xmlCtxtUseOptions(ctxt, options);
 
 	ctxt->recovery = recover;
@@ -1735,7 +1736,9 @@ PHP_FUNCTION(dom_document_xinclude)
 
 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
 
+	PHP_LIBXML_SANITIZE_GLOBALS(xinclude);
 	err = xmlXIncludeProcessFlags(docp, (int)flags);
+	PHP_LIBXML_RESTORE_GLOBALS(xinclude);
 
 	/* XML_XINCLUDE_START and XML_XINCLUDE_END nodes need to be removed as these
 	are added via xmlXIncludeProcess to mark beginning and ending of xincluded document
@@ -1774,6 +1777,7 @@ PHP_FUNCTION(dom_document_validate)
 
 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
 
+	PHP_LIBXML_SANITIZE_GLOBALS(validate);
 	cvp = xmlNewValidCtxt();
 
 	cvp->userData = NULL;
@@ -1785,6 +1789,7 @@ PHP_FUNCTION(dom_document_validate)
 	} else {
 		RETVAL_FALSE;
 	}
+	PHP_LIBXML_RESTORE_GLOBALS(validate);
 
 	xmlFreeValidCtxt(cvp);
 
@@ -1818,14 +1823,18 @@ static void _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type
 
 	DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
 
+	PHP_LIBXML_SANITIZE_GLOBALS(new_parser_ctxt);
+
 	switch (type) {
 	case DOM_LOAD_FILE:
 		if (CHECK_NULL_PATH(source, source_len)) {
+			PHP_LIBXML_RESTORE_GLOBALS(new_parser_ctxt);
 			php_error_docref(NULL, E_WARNING, "Invalid Schema file source");
 			RETURN_FALSE;
 		}
 		valid_file = _dom_get_valid_file_path(source, resolved_path, MAXPATHLEN);
 		if (!valid_file) {
+			PHP_LIBXML_RESTORE_GLOBALS(new_parser_ctxt);
 			php_error_docref(NULL, E_WARNING, "Invalid Schema file source");
 			RETURN_FALSE;
 		}
@@ -1846,6 +1855,7 @@ static void _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type
 		parser);
 	sptr = xmlSchemaParse(parser);
 	xmlSchemaFreeParserCtxt(parser);
+	PHP_LIBXML_RESTORE_GLOBALS(new_parser_ctxt);
 	if (!sptr) {
 		php_error_docref(NULL, E_WARNING, "Invalid Schema");
 		RETURN_FALSE;
@@ -1866,11 +1876,13 @@ static void _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type
 	}
 #endif
 
+	PHP_LIBXML_SANITIZE_GLOBALS(validate);
 	xmlSchemaSetValidOptions(vptr, valid_opts);
 	xmlSchemaSetValidErrors(vptr, php_libxml_error_handler, php_libxml_error_handler, vptr);
 	is_valid = xmlSchemaValidateDoc(vptr, docp);
 	xmlSchemaFree(sptr);
 	xmlSchemaFreeValidCtxt(vptr);
+	PHP_LIBXML_RESTORE_GLOBALS(validate);
 
 	if (is_valid == 0) {
 		RETURN_TRUE;
@@ -1940,12 +1952,14 @@ static void _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int typ
 		return;
 	}
 
+	PHP_LIBXML_SANITIZE_GLOBALS(parse);
 	xmlRelaxNGSetParserErrors(parser,
 		(xmlRelaxNGValidityErrorFunc) php_libxml_error_handler,
 		(xmlRelaxNGValidityWarningFunc) php_libxml_error_handler,
 		parser);
 	sptr = xmlRelaxNGParse(parser);
 	xmlRelaxNGFreeParserCtxt(parser);
+	PHP_LIBXML_RESTORE_GLOBALS(parse);
 	if (!sptr) {
 		php_error_docref(NULL, E_WARNING, "Invalid RelaxNG");
 		RETURN_FALSE;
@@ -2045,6 +2059,7 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
 		ctxt->sax->error = php_libxml_ctx_error;
 		ctxt->sax->warning = php_libxml_ctx_warning;
 	}
+	php_libxml_sanitize_parse_ctxt_options(ctxt);
 	if (options) {
 		htmlCtxtUseOptions(ctxt, (int)options);
 	}
diff --git a/ext/dom/documentfragment.c b/ext/dom/documentfragment.c
index 87cb691501..b7ecfdc14b 100644
--- a/ext/dom/documentfragment.c
+++ b/ext/dom/documentfragment.c
@@ -134,7 +134,9 @@ PHP_METHOD(domdocumentfragment, appendXML) {
 	}
 
 	if (data) {
+		PHP_LIBXML_SANITIZE_GLOBALS(parse);
 		err = xmlParseBalancedChunkMemory(nodep->doc, NULL, NULL, 0, (xmlChar *) data, &lst);
+		PHP_LIBXML_RESTORE_GLOBALS(parse);
 		if (err != 0) {
 			RETURN_FALSE;
 		}
diff --git a/ext/libxml/php_libxml.h b/ext/libxml/php_libxml.h
index f7aa726d88..371b4eecc5 100644
--- a/ext/libxml/php_libxml.h
+++ b/ext/libxml/php_libxml.h
@@ -122,6 +122,42 @@ PHP_LIBXML_API void php_libxml_shutdown(void);
 ZEND_TSRMLS_CACHE_EXTERN()
 #endif
 
+/* Other extension may override the global state options, these global options
+ * are copied initially to ctxt->options. Set the options to a known good value.
+ * See libxml2 globals.c and parserInternals.c.
+ * The unique_name argument allows multiple sanitizes and restores within the
+ * same function, even nested is necessary. */
+#define PHP_LIBXML_SANITIZE_GLOBALS(unique_name) \
+	int xml_old_loadsubset_##unique_name = xmlLoadExtDtdDefaultValue; \
+	xmlLoadExtDtdDefaultValue = 0; \
+	int xml_old_validate_##unique_name = xmlDoValidityCheckingDefaultValue; \
+	xmlDoValidityCheckingDefaultValue = 0; \
+	int xml_old_pedantic_##unique_name = xmlPedanticParserDefault(0); \
+	int xml_old_substitute_##unique_name = xmlSubstituteEntitiesDefault(0); \
+	int xml_old_linenrs_##unique_name = xmlLineNumbersDefault(0); \
+	int xml_old_blanks_##unique_name = xmlKeepBlanksDefault(1);
+
+#define PHP_LIBXML_RESTORE_GLOBALS(unique_name) \
+	xmlLoadExtDtdDefaultValue = xml_old_loadsubset_##unique_name; \
+	xmlDoValidityCheckingDefaultValue = xml_old_validate_##unique_name; \
+	(void) xmlPedanticParserDefault(xml_old_pedantic_##unique_name); \
+	(void) xmlSubstituteEntitiesDefault(xml_old_substitute_##unique_name); \
+	(void) xmlLineNumbersDefault(xml_old_linenrs_##unique_name); \
+	(void) xmlKeepBlanksDefault(xml_old_blanks_##unique_name);
+
+/* Alternative for above, working directly on the context and not setting globals.
+ * Generally faster because no locking is involved, and this has the advantage that it sets the options to a known good value. */
+static zend_always_inline void php_libxml_sanitize_parse_ctxt_options(xmlParserCtxtPtr ctxt)
+{
+	ctxt->loadsubset = 0;
+	ctxt->validate = 0;
+	ctxt->pedantic = 0;
+	ctxt->replaceEntities = 0;
+	ctxt->linenumbers = 0;
+	ctxt->keepBlanks = 1;
+	ctxt->options = 0;
+}
+
 #else /* HAVE_LIBXML */
 #define libxml_module_ptr NULL
 #endif
diff --git a/ext/simplexml/simplexml.c b/ext/simplexml/simplexml.c
index ae067cdcdd..bfac015964 100644
--- a/ext/simplexml/simplexml.c
+++ b/ext/simplexml/simplexml.c
@@ -2229,7 +2229,9 @@ PHP_FUNCTION(simplexml_load_file)
 		RETURN_FALSE;
 	}
 
+	PHP_LIBXML_SANITIZE_GLOBALS(read_file);
 	docp = xmlReadFile(filename, NULL, (int)options);
+	PHP_LIBXML_RESTORE_GLOBALS(read_file);
 
 	if (!docp) {
 		RETURN_FALSE;
@@ -2283,7 +2285,9 @@ PHP_FUNCTION(simplexml_load_string)
 		RETURN_FALSE;
 	}
 
+	PHP_LIBXML_SANITIZE_GLOBALS(read_memory);
 	docp = xmlReadMemory(data, (int)data_len, NULL, NULL, (int)options);
+	PHP_LIBXML_RESTORE_GLOBALS(read_memory);
 
 	if (!docp) {
 		RETURN_FALSE;
@@ -2333,7 +2337,9 @@ SXE_METHOD(__construct)
 		return;
 	}
 
+	PHP_LIBXML_SANITIZE_GLOBALS(read_file_or_memory);
 	docp = is_url ? xmlReadFile(data, NULL, (int)options) : xmlReadMemory(data, (int)data_len, NULL, NULL, (int)options);
+	PHP_LIBXML_RESTORE_GLOBALS(read_file_or_memory);
 
 	if (!docp) {
 		((php_libxml_node_object *)sxe)->document = NULL;
diff --git a/ext/soap/php_xml.c b/ext/soap/php_xml.c
index 1ac684eb81..053960c559 100644
--- a/ext/soap/php_xml.c
+++ b/ext/soap/php_xml.c
@@ -94,6 +94,7 @@ xmlDocPtr soap_xmlParseFile(const char *filename)
 	if (ctxt) {
 		zend_bool old;
 
+		php_libxml_sanitize_parse_ctxt_options(ctxt);
 		ctxt->keepBlanks = 0;
 		ctxt->sax->ignorableWhitespace = soap_ignorableWhitespace;
 		ctxt->sax->comment = soap_Comment;
@@ -144,6 +145,7 @@ xmlDocPtr soap_xmlParseMemory(const void *buf, size_t buf_size)
 	if (ctxt) {
 		zend_bool old;
 
+		php_libxml_sanitize_parse_ctxt_options(ctxt);
 		ctxt->sax->ignorableWhitespace = soap_ignorableWhitespace;
 		ctxt->sax->comment = soap_Comment;
 		ctxt->sax->warning = NULL;
diff --git a/ext/xml/compat.c b/ext/xml/compat.c
index ef83485722..c55047215b 100644
--- a/ext/xml/compat.c
+++ b/ext/xml/compat.c
@@ -19,6 +19,7 @@
 #include "php.h"
 #if defined(HAVE_LIBXML) && (defined(HAVE_XML) || defined(HAVE_XMLRPC)) && !defined(HAVE_LIBEXPAT)
 #include "expat_compat.h"
+#include "ext/libxml/php_libxml.h"
 
 typedef struct _php_xml_ns {
 	xmlNsPtr nsptr;
@@ -473,6 +474,7 @@ XML_ParserCreate_MM(const XML_Char *encoding, const XML_Memory_Handling_Suite *m
 	parser->parser->charset = XML_CHAR_ENCODING_NONE;
 #endif
 
+	php_libxml_sanitize_parse_ctxt_options(parser->parser);
 #if LIBXML_VERSION >= 20703
 	xmlCtxtUseOptions(parser->parser, XML_PARSE_OLDSAX);
 #endif
diff --git a/ext/xmlreader/php_xmlreader.c b/ext/xmlreader/php_xmlreader.c
index 0f2b62ae20..f0fd013ff9 100644
--- a/ext/xmlreader/php_xmlreader.c
+++ b/ext/xmlreader/php_xmlreader.c
@@ -301,6 +301,7 @@ static xmlRelaxNGPtr _xmlreader_get_relaxNG(char *source, size_t source_len, siz
 		return NULL;
 	}
 
+	PHP_LIBXML_SANITIZE_GLOBALS(parse);
 	if (error_func || warn_func) {
 		xmlRelaxNGSetParserErrors(parser,
 			(xmlRelaxNGValidityErrorFunc) error_func,
@@ -309,6 +310,7 @@ static xmlRelaxNGPtr _xmlreader_get_relaxNG(char *source, size_t source_len, siz
 	}
 	sptr = xmlRelaxNGParse(parser);
 	xmlRelaxNGFreeParserCtxt(parser);
+	PHP_LIBXML_RESTORE_GLOBALS(parse);
 
 	return sptr;
 }
@@ -881,7 +883,9 @@ PHP_METHOD(xmlreader, open)
 	valid_file = _xmlreader_get_valid_file_path(source, resolved_path, MAXPATHLEN );
 
 	if (valid_file) {
+		PHP_LIBXML_SANITIZE_GLOBALS(reader_for_file);
 		reader = xmlReaderForFile(valid_file, encoding, options);
+		PHP_LIBXML_RESTORE_GLOBALS(reader_for_file);
 	}
 
 	if (reader == NULL) {
@@ -959,7 +963,9 @@ PHP_METHOD(xmlreader, setSchema)
 
 	intern = Z_XMLREADER_P(id);
 	if (intern && intern->ptr) {
+		PHP_LIBXML_SANITIZE_GLOBALS(schema);
 		retval = xmlTextReaderSchemaValidate(intern->ptr, source);
+		PHP_LIBXML_RESTORE_GLOBALS(schema);
 
 		if (retval == 0) {
 			RETURN_TRUE;
@@ -1079,6 +1085,7 @@ PHP_METHOD(xmlreader, XML)
 			}
 			uri = (char *) xmlCanonicPath((const xmlChar *) resolved_path);
 		}
+		PHP_LIBXML_SANITIZE_GLOBALS(text_reader);
 		reader = xmlNewTextReader(inputbfr, uri);
 
 		if (reader != NULL) {
@@ -1099,9 +1106,11 @@ PHP_METHOD(xmlreader, XML)
 					xmlFree(uri);
 				}
 
+				PHP_LIBXML_RESTORE_GLOBALS(text_reader);
 				return;
 			}
 		}
+		PHP_LIBXML_RESTORE_GLOBALS(text_reader);
 	}
 
 	if (uri) {
diff --git a/ext/xsl/xsltprocessor.c b/ext/xsl/xsltprocessor.c
index 9948d6f0b3..7bbe640a5c 100644
--- a/ext/xsl/xsltprocessor.c
+++ b/ext/xsl/xsltprocessor.c
@@ -396,7 +396,7 @@ PHP_FUNCTION(xsl_xsltprocessor_import_stylesheet)
 	xmlDoc *doc = NULL, *newdoc = NULL;
 	xsltStylesheetPtr sheetp, oldsheetp;
 	xsl_object *intern;
-	int prevSubstValue, prevExtDtdValue, clone_docu = 0;
+	int clone_docu = 0;
 	xmlNode *nodep = NULL;
 	zend_object_handlers *std_hnd;
 	zval *cloneDocu, member, rv;
@@ -419,13 +419,12 @@ PHP_FUNCTION(xsl_xsltprocessor_import_stylesheet)
 	stylesheet document otherwise the node proxies will be a mess */
 	newdoc = xmlCopyDoc(doc, 1);
 	xmlNodeSetBase((xmlNodePtr) newdoc, (xmlChar *)doc->URL);
-	prevSubstValue = xmlSubstituteEntitiesDefault(1);
-	prevExtDtdValue = xmlLoadExtDtdDefaultValue;
+	PHP_LIBXML_SANITIZE_GLOBALS(parse);
+	xmlSubstituteEntitiesDefault(1);
 	xmlLoadExtDtdDefaultValue = XML_DETECT_IDS | XML_COMPLETE_ATTRS;
 
 	sheetp = xsltParseStylesheetDoc(newdoc);
-	xmlSubstituteEntitiesDefault(prevSubstValue);
-	xmlLoadExtDtdDefaultValue = prevExtDtdValue;
+	PHP_LIBXML_RESTORE_GLOBALS(parse);
 
 	if (!sheetp) {
 		xmlFreeDoc(newdoc);
-- 
2.41.0

From 3ac0ce8a462cb31815330d1410e1a8a615c395eb Mon Sep 17 00:00:00 2001
From: Remi Collet <remi@remirepo.net>
Date: Tue, 1 Aug 2023 07:22:33 +0200
Subject: [PATCH 3/3] NEWS

(cherry picked from commit ef1d507acf7be23d7624dc3c891683b2218feb51)
(cherry picked from commit 3cf7c2b10e577136b267f2d90bfdff6743271c5c)
(cherry picked from commit 79c0bf87711036b83f8ee1723c034ccc839d847b)
---
 NEWS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/NEWS b/NEWS
index e7cdaa7698..99387fbfd7 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,16 @@
 PHP                                                                        NEWS
 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 
+Backported from 8.0.30
+
+- Libxml:
+  . Fixed bug GHSA-3qrf-m4j2-pcrr (Security issue with external entity loading
+    in XML without enabling it). (CVE-2023-3823) (nielsdos, ilutov)
+
+- Phar:
+  . Fixed bug GHSA-jqcx-ccgc-xwhv (Buffer mismanagement in phar_dir_read()).
+    (CVE-2023-3824) (nielsdos)
+
 Backported from 8.0.29
 
 - Soap:
-- 
2.41.0