From 5d78ecbe2577875294a2676577ca34bb2bf8e369 Mon Sep 17 00:00:00 2001
From: Sergey Biryukov
Date: Mon, 22 Aug 2022 15:37:59 +0000
Subject: [PATCH] Database: Account for `utf8` being renamed to `utf8mb3` in newer MariaDB and MySQL versions.

From [https://mariadb.com/kb/en/mariadb-1061-release-notes/ MariaDB 10.6.1 release notes]:
> The `utf8` [https://mariadb.com/kb/en/character-sets/ character set] (and related collations) is now by default an alias for `utf8mb3` rather than the other way around. It can be set to imply `utf8mb4` by changing the value of the [https://mariadb.com/kb/en/server-system-variables/#old_mode old_mode] system variable ([https://jira.mariadb.org/browse/MDEV-8334 MDEV-8334]).

From [https://dev.mysql.com/doc/relnotes/mysql/8.0/en/news-8-0-30.html#mysqld-8-0-30-charset MySQL 8.0.30 release notes]:
> **Important Change:** A previous change renamed character sets having deprecated names prefixed with `utf8_` to use `utf8mb3_` instead. In this release, we rename the `utf8_` collations as well, using the `utf8mb3_` prefix; this is to make the collation names consistent with those of the character sets, not to rely any longer on the deprecated collation names, and to clarify the distinction between `utf8mb3` and `utf8mb4`. The names using the `utf8mb3_` prefix are now used exclusively for these collations in the output of `SHOW` statements such as `SHOW CREATE TABLE`, as well as in the values displayed in the columns of Information Schema tables including the `COLLATIONS` and `COLUMNS` tables.

This commit adds `utf8mb3_bin` and `utf8mb3_general_ci` to the list of safe collations recognized by `wpdb::check_safe_collation()`.

The full list is now as follows:
* `utf8_bin`
* `utf8_general_ci`
* `utf8mb3_bin`
* `utf8mb3_general_ci`
* `utf8mb4_bin`
* `utf8mb4_general_ci`

The change is covered by existing database charset unit tests: six tests which previously failed on MariaDB 10.6.1+ or MySQL 8.0.30+ now pass.

Includes: * Adjusting the expected test results based on MariaDB and MySQL version. * Using named data providers for the affected tests to make test output more descriptive. * Adding a failure message to each assertion when multiple assertions are used in the test. References: * [https://mariadb.com/kb/en/mariadb-1061-release-notes/ MariaDB 10.6.1 release notes] * [https://jira.mariadb.org/browse/MDEV-8334 MDEV-8334 Rename utf8 to utf8mb3] * [https://dev.mysql.com/doc/relnotes/mysql/8.0/en/news-8-0-30.html#mysqld-8-0-30-charset MySQL 8.0.30 release notes] * [https://dev.mysql.com/doc/refman/8.0/en/charset-unicode-utf8mb3.html The utf8mb3 Character Set (3-Byte UTF-8 Unicode Encoding)] Follow-up to [30345], [32162], [37320]. Props skithund, ayeshrajans, JavierCasares, SergeyBiryukov. Fixes #53623. git-svn-id: https://develop.svn.wordpress.org/trunk@53918 602fd350-edb4-49c9-b593-d223f7449a82 --- src/wp-includes/class-wpdb.php | 11 ++- tests/phpunit/tests/db/charset.php | 146 ++++++++++++++++++++--------- 2 files changed, 111 insertions(+), 46 deletions(-) diff --git a/src/wp-includes/class-wpdb.php b/src/wp-includes/class-wpdb.php index 816f8a13e5..81a43900f5 100644 --- a/src/wp-includes/class-wpdb.php +++ b/src/wp-includes/class-wpdb.php @@ -3376,12 +3376,21 @@ class wpdb { } // If any of the columns don't have one of these collations, it needs more sanity checking. + $safe_collations = array( + 'utf8_bin', + 'utf8_general_ci', + 'utf8mb3_bin', + 'utf8mb3_general_ci', + 'utf8mb4_bin', + 'utf8mb4_general_ci', + ); + foreach ( $this->col_meta[ $table ] as $col ) { if ( empty( $col->Collation ) ) { continue; } - if ( ! in_array( $col->Collation, array( 'utf8_general_ci', 'utf8_bin', 'utf8mb4_general_ci', 'utf8mb4_bin' ), true ) ) { + if ( ! 
in_array( $col->Collation, $safe_collations, true ) ) { return false; } } diff --git a/tests/phpunit/tests/db/charset.php b/tests/phpunit/tests/db/charset.php index 1ccdf183ce..35135c8e7c 100644 --- a/tests/phpunit/tests/db/charset.php +++ b/tests/phpunit/tests/db/charset.php @@ -9,18 +9,32 @@ class Tests_DB_Charset extends WP_UnitTestCase { /** - * Our special WPDB + * Our special WPDB. * * @var resource */ protected static $_wpdb; /** - * The version of the MySQL server. + * Whether to expect utf8mb3 instead of utf8 in various commands output. + * + * @var bool + */ + private static $utf8_is_utf8mb3 = false; + + /** + * The database server version. * * @var string */ - private static $server_info; + private static $db_version; + + /** + * Full database server information. + * + * @var string + */ + private static $db_server_info; public static function set_up_before_class() { parent::set_up_before_class(); @@ -29,7 +43,18 @@ class Tests_DB_Charset extends WP_UnitTestCase { self::$_wpdb = new WpdbExposedMethodsForTesting(); - self::$server_info = self::$_wpdb->db_server_info(); + self::$db_version = self::$_wpdb->db_version(); + self::$db_server_info = self::$_wpdb->db_server_info(); + + /* + * MariaDB 10.6.1 or later and MySQL 8.0.30 or later + * use utf8mb3 instead of utf8 in various commands output. + */ + if ( str_contains( self::$db_server_info, 'MariaDB' ) && version_compare( self::$db_version, '10.6.1', '>=' ) + || ! str_contains( self::$db_server_info, 'MariaDB' ) && version_compare( self::$db_version, '8.0.30', '>=' ) + ) { + self::$utf8_is_utf8mb3 = true; + } } /** @@ -492,7 +517,9 @@ class Tests_DB_Charset extends WP_UnitTestCase { $this->markTestSkipped( "The current MySQL server doesn't support the utf8mb4 character set." 
); } - if ( 'big5' === $new_charset && 'byte' === $data[0]['length']['type'] && false !== strpos( self::$server_info, 'MariaDB' ) ) { + if ( 'big5' === $new_charset && 'byte' === $data[0]['length']['type'] + && str_contains( self::$db_server_info, 'MariaDB' ) + ) { $this->markTestSkipped( "MariaDB doesn't support this data set. See https://core.trac.wordpress.org/ticket/33171." ); } @@ -808,6 +835,10 @@ class Tests_DB_Charset extends WP_UnitTestCase { self::$_wpdb->query( $create ); foreach ( $expected_charset as $column => $charset ) { + if ( self::$utf8_is_utf8mb3 && 'utf8' === $charset ) { + $charset = 'utf8mb3'; + } + $this->assertSame( $charset, self::$_wpdb->get_col_charset( $table, $column ) ); $this->assertSame( $charset, self::$_wpdb->get_col_charset( strtoupper( $table ), strtoupper( $column ) ) ); } @@ -875,27 +906,29 @@ class Tests_DB_Charset extends WP_UnitTestCase { public function data_strip_invalid_text_from_query() { $table_name = 'strip_invalid_text_from_query_table'; $data = array( - array( + 'utf8 + binary' => array( // Binary tables don't get stripped. - '( a VARCHAR(50) CHARACTER SET utf8, b BINARY )', // Create. - "('foo\xf0\x9f\x98\x88bar', 'foo')", // Query. - "('foo\xf0\x9f\x98\x88bar', 'foo')", // Expected result. + 'create' => '( a VARCHAR(50) CHARACTER SET utf8, b BINARY )', + 'query' => "('foo\xf0\x9f\x98\x88bar', 'foo')", + 'expected' => "('foo\xf0\x9f\x98\x88bar', 'foo')", ), - array( + 'utf8 + utf8mb4' => array( // utf8/utf8mb4 tables default to utf8. - '( a VARCHAR(50) CHARACTER SET utf8, b VARCHAR(50) CHARACTER SET utf8mb4 )', - "('foo\xf0\x9f\x98\x88bar', 'foo')", - "('foobar', 'foo')", + 'create' => '( a VARCHAR(50) CHARACTER SET utf8, b VARCHAR(50) CHARACTER SET utf8mb4 )', + 'query' => "('foo\xf0\x9f\x98\x88bar', 'foo')", + 'expected' => "('foobar', 'foo')", ), ); - foreach ( $data as $i => &$value ) { - $this_table_name = $table_name . '_' . 
$i; + $i = 0; - $value[0] = "CREATE TABLE $this_table_name {$value[0]}"; - $value[1] = "INSERT INTO $this_table_name VALUES {$value[1]}"; - $value[2] = "INSERT INTO $this_table_name VALUES {$value[2]}"; - $value[3] = "DROP TABLE IF EXISTS $this_table_name"; + foreach ( $data as &$value ) { + $this_table_name = $table_name . '_' . $i++; + + $value['create'] = "CREATE TABLE $this_table_name {$value['create']}"; + $value['query'] = "INSERT INTO $this_table_name VALUES {$value['query']}"; + $value['expected'] = "INSERT INTO $this_table_name VALUES {$value['expected']}"; + $value['drop'] = "DROP TABLE IF EXISTS $this_table_name"; } unset( $value ); @@ -979,42 +1012,44 @@ class Tests_DB_Charset extends WP_UnitTestCase { public function data_table_collation_check() { $table_name = 'table_collation_check'; $data = array( - array( + 'utf8_bin' => array( // utf8_bin tables don't need extra sanity checking. - '( a VARCHAR(50) COLLATE utf8_bin )', // Create. - true, // Expected result. + 'create' => '( a VARCHAR(50) COLLATE utf8_bin )', + 'expected' => true, ), - array( + 'utf8_general_ci' => array( // Neither do utf8_general_ci tables. - '( a VARCHAR(50) COLLATE utf8_general_ci )', - true, + 'create' => '( a VARCHAR(50) COLLATE utf8_general_ci )', + 'expected' => true, ), - array( + 'utf8_unicode_ci' => array( // utf8_unicode_ci tables do. - '( a VARCHAR(50) COLLATE utf8_unicode_ci )', - false, + 'create' => '( a VARCHAR(50) COLLATE utf8_unicode_ci )', + 'expected' => false, ), - array( + 'utf8_bin + big5_chinese_ci' => array( // utf8_bin tables don't need extra sanity checking, // except for when they're not just utf8_bin. - '( a VARCHAR(50) COLLATE utf8_bin, b VARCHAR(50) COLLATE big5_chinese_ci )', - false, + 'create' => '( a VARCHAR(50) COLLATE utf8_bin, b VARCHAR(50) COLLATE big5_chinese_ci )', + 'expected' => false, ), - array( + 'utf8_bin + int' => array( // utf8_bin tables don't need extra sanity checking // when the other columns aren't strings. 
- '( a VARCHAR(50) COLLATE utf8_bin, b INT )', - true, + 'create' => '( a VARCHAR(50) COLLATE utf8_bin, b INT )', + 'expected' => true, ), ); - foreach ( $data as $i => &$value ) { - $this_table_name = $table_name . '_' . $i; + $i = 0; - $value[0] = "CREATE TABLE $this_table_name {$value[0]}"; - $value[2] = "SELECT * FROM $this_table_name WHERE a='\xf0\x9f\x98\x88'"; - $value[3] = "DROP TABLE IF EXISTS $this_table_name"; - $value[4] = array( + foreach ( $data as &$value ) { + $this_table_name = $table_name . '_' . $i++; + + $value['create'] = "CREATE TABLE $this_table_name {$value['create']}"; + $value['query'] = "SELECT * FROM $this_table_name WHERE a='\xf0\x9f\x98\x88'"; + $value['drop'] = "DROP TABLE IF EXISTS $this_table_name"; + $value['always_true'] = array( "SELECT * FROM $this_table_name WHERE a='foo'", "SHOW FULL TABLES LIKE $this_table_name", "DESCRIBE $this_table_name", @@ -1040,11 +1075,31 @@ class Tests_DB_Charset extends WP_UnitTestCase { self::$_wpdb->query( $create ); $return = self::$_wpdb->check_safe_collation( $query ); - $this->assertSame( $expected, $return ); + $this->assertSame( + $expected, + $return, + sprintf( + "wpdb::check_safe_collation() should return %s for this query.\n" . + "Table: %s\n" . + 'Query: %s', + $expected ? 'true' : 'false', + $create, + $query + ) + ); foreach ( $always_true as $true_query ) { $return = self::$_wpdb->check_safe_collation( $true_query ); - $this->assertTrue( $return ); + $this->assertTrue( + $return, + sprintf( + "wpdb::check_safe_collation() should return true for this query.\n" . + "Table: %s\n" . 
+ 'Query: %s', + $create, + $true_query + ) + ); } self::$_wpdb->query( $drop ); @@ -1115,12 +1170,13 @@ class Tests_DB_Charset extends WP_UnitTestCase { */ public function test_set_charset_changes_the_connection_collation() { self::$_wpdb->set_charset( self::$_wpdb->dbh, 'utf8', 'utf8_general_ci' ); - $results = self::$_wpdb->get_results( "SHOW VARIABLES WHERE Variable_name='collation_connection'" ); - $this->assertSame( 'utf8_general_ci', $results[0]->Value ); + $results = self::$_wpdb->get_results( "SHOW VARIABLES WHERE Variable_name='collation_connection'" ); + $expected = self::$utf8_is_utf8mb3 ? 'utf8mb3_general_ci' : 'utf8_general_ci'; + $this->assertSame( $expected, $results[0]->Value, "Collation should be set to $expected." ); self::$_wpdb->set_charset( self::$_wpdb->dbh, 'utf8mb4', 'utf8mb4_unicode_ci' ); $results = self::$_wpdb->get_results( "SHOW VARIABLES WHERE Variable_name='collation_connection'" ); - $this->assertSame( 'utf8mb4_unicode_ci', $results[0]->Value ); + $this->assertSame( 'utf8mb4_unicode_ci', $results[0]->Value, 'Collation should be set to utf8mb4_unicode_ci.' ); self::$_wpdb->set_charset( self::$_wpdb->dbh ); }