From 1d5d9e24c4cdb09d22d9819c6573a0d2d677f2b2 Mon Sep 17 00:00:00 2001
From: Jb Audras <audrasjb@git.wordpress.org>
Date: Thu, 21 Jul 2022 21:09:56 +0000
Subject: [PATCH] Formatting: Normalize to Unicode NFC encoding before
 converting accent characters in `remove_accents()`.

This changeset adds Unicode sequence normalization from NFD to NFC via the `normalizer_normalize()` PHP function, which is available with the recommended `intl` PHP extension.

This fixes an issue where NFD characters were not properly sanitized. It also provides a unit test for NFD sequences (alternate Unicode representations of the same characters).

Props NumidWasNotAvailable, targz, nacin, nunomorgadinho, p_enrique, gitlost, SergeyBiryukov, markoheijnen, mikeschroder, ocean90, pento, helen, rodrigosevero, zodiac1978, ironprogrammer, audrasjb, azaozz, laboiteare, nuryko, virgar, dxd5001, onnimonni, johnbillion.
Fixes #24661, #47763, #35951.
See #30130, #52654.


git-svn-id: https://develop.svn.wordpress.org/trunk@53754 602fd350-edb4-49c9-b593-d223f7449a82
---
 src/wp-includes/formatting.php                 | 10 ++++++++++
 .../phpunit/tests/formatting/removeAccents.php | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 3b977f4f4e..46668daafe 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -1584,6 +1584,7 @@ function utf8_uri_encode( $utf8_string, $length = 0, $encode_ascii_characters =
  * @since 4.8.0 Added locale support for `bs_BA`.
  * @since 5.7.0 Added locale support for `de_AT`.
  * @since 6.0.0 Added the `$locale` parameter.
+ * @since 6.1.0 Added Unicode NFC encoding normalization support.
  *
  * @param string $string Text that might have accent characters.
  * @param string $locale Optional. The locale to use for accent removal. Some character
@@ -1597,6 +1598,15 @@ function remove_accents( $string, $locale = '' ) {
 	}
 
 	if ( seems_utf8( $string ) ) {
+
+		// Unicode sequence normalization from NFD (Normalization Form Decomposed)
+		// to NFC (Normalization Form [Pre]Composed), the encoding used in this function.
+		if ( function_exists( 'normalizer_normalize' ) ) {
+			if ( ! normalizer_is_normalized( $string, Normalizer::FORM_C ) ) {
+				$string = normalizer_normalize( $string, Normalizer::FORM_C );
+			}
+		}
+
 		$chars = array(
 			// Decompositions for Latin-1 Supplement.
 			'ª' => 'a',
diff --git a/tests/phpunit/tests/formatting/removeAccents.php b/tests/phpunit/tests/formatting/removeAccents.php
index 249625ca3b..8140d2df9b 100644
--- a/tests/phpunit/tests/formatting/removeAccents.php
+++ b/tests/phpunit/tests/formatting/removeAccents.php
@@ -11,6 +11,24 @@ class Tests_Formatting_RemoveAccents extends WP_UnitTestCase {
 		$this->assertSame( 'abcdefghijkl', remove_accents( 'abcdefghijkl' ) );
 	}
 
+	/**
+	 * Tests that NFD (Normalization Form Decomposed) input is handled by
+	 * `remove_accents()`, which works on NFC (Normalization Form [Pre]Composed).
+	 *
+	 * The input is decomposed explicitly so the test still exercises NFD even if
+	 * this file gets re-saved with precomposed characters.
+	 * For more information, see https://unicode.org/faq/normalization.html.
+	 *
+	 * @ticket 24661
+	 * @requires extension intl
+	 */
+	public function test_remove_accents_latin1_supplement_nfd_encoding() {
+		$input  = normalizer_normalize( 'ªºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', Normalizer::FORM_D );
+		$output = 'aoAAAAAAAECEEEEIIIIDNOOOOOOUUUUYTHsaaaaaaaeceeeeiiiidnoooooouuuuythy';
+
+		$this->assertSame( $output, remove_accents( $input ), 'remove_accents replaces Latin-1 Supplement with NFD encoding' );
+	}
+
 	/**
 	 * @ticket 9591
 	 */