From f9a6a7aa565a6e555ea8439b076134363bd66a91 Mon Sep 17 00:00:00 2001 From: Andrew Ozz Date: Wed, 4 Mar 2015 19:28:53 +0000 Subject: [PATCH] PressThis: - Improve handling of the data, both from the bookmarklet and from server-side parsing. - Standardize on processing the data in PHP and remove duplicate code from JS. - Improve the bookmarklet code and remove pre-filtering of the data. Part props stephdau, see #31373. git-svn-id: https://develop.svn.wordpress.org/trunk@31609 602fd350-edb4-49c9-b593-d223f7449a82 --- src/wp-admin/includes/class-wp-press-this.php | 364 +++++++++++++----- src/wp-admin/js/bookmarklet.js | 49 +-- src/wp-admin/js/bookmarklet.min.js | 2 +- src/wp-admin/js/press-this.js | 124 +----- src/wp-includes/link-template.php | 2 +- 5 files changed, 293 insertions(+), 248 deletions(-) diff --git a/src/wp-admin/includes/class-wp-press-this.php b/src/wp-admin/includes/class-wp-press-this.php index 3ea927d3d2..0865f6ac70 100644 --- a/src/wp-admin/includes/class-wp-press-this.php +++ b/src/wp-admin/includes/class-wp-press-this.php @@ -40,7 +40,7 @@ class WP_Press_This { return array( // Used to trigger the bookmarklet update notice. // Needs to be set here and in get_shortcut_link() in wp-includes/link-template.php. - 'version' => '5', + 'version' => '6', /** * Filter whether or not Press This should redirect the user in the parent window upon save. @@ -278,7 +278,7 @@ class WP_Press_This { */ public function fetch_source_html( $url ) { // Download source page to tmp file. - $source_tmp_file = ( ! empty( $url ) ) ? download_url( $url ) : ''; + $source_tmp_file = ( ! empty( $url ) ) ? download_url( $url, 30 ) : ''; $source_content = ''; if ( ! 
is_wp_error( $source_tmp_file ) && file_exists( $source_tmp_file ) ) { @@ -318,6 +318,162 @@ class WP_Press_This { return $source_content; } + private function _limit_array( $value ) { + if ( is_array( $value ) ) { + if ( count( $value ) > 50 ) { + return array_slice( $value, 0, 50 ); + } + + return $value; + } + + return array(); + } + + private function _limit_string( $value ) { + $return = ''; + + if ( is_numeric( $value ) || is_bool( $value ) ) { + $return = (string) $value; + } else if ( is_string( $value ) ) { + if ( mb_strlen( $value ) > 5000 ) { + $return = mb_substr( $value, 0, 5000 ); + } else { + $return = $value; + } + + $return = html_entity_decode( $return, ENT_QUOTES, 'UTF-8' ); + $return = sanitize_text_field( trim( $return ) ); + } + + return $return; + } + + private function _limit_url( $url ) { + if ( ! is_string( $url ) ) { + return ''; + } + + $url = $this->_limit_string( $url ); + + // HTTP 1.1 allows 8000 chars but the "de-facto" standard supported in all current browsers is 2048. + if ( mb_strlen( $url ) > 2048 ) { + return ''; // Return empty rather than a truncated/invalid URL + } + + // Only allow http(s) or protocol relative URLs. + if ( ! 
preg_match( '%^(https?:)?//%i', $url ) ) { + return ''; + } + + if ( strpos( $url, '"' ) !== false || strpos( $url, ' ' ) !== false ) { + return ''; + } + + return $url; + } + + private function _limit_img( $src ) { + $src = $this->_limit_url( $src ); + + if ( preg_match( '/\/ad[sx]{1}?\//', $src ) ) { + // Ads + return ''; + } else if ( preg_match( '/(\/share-?this[^\.]+?\.[a-z0-9]{3,4})(\?.*)?$/', $src ) ) { + // Share-this type button + return ''; + } else if ( preg_match( '/\/(spinner|loading|spacer|blank|rss)\.(gif|jpg|png)/', $src ) ) { + // Loaders, spinners, spacers + return ''; + } else if ( preg_match( '/\/([^\.\/]+[-_]{1})?(spinner|loading|spacer|blank)s?([-_]{1}[^\.\/]+)?\.[a-z0-9]{3,4}/', $src ) ) { + // Fancy loaders, spinners, spacers + return ''; + } else if ( preg_match( '/([^\.\/]+[-_]{1})?thumb[^.]*\.(gif|jpg|png)$/', $src ) ) { + // Thumbnails, too small, usually irrelevant to context + return ''; + } else if ( preg_match( '/\/wp-includes\//', $src ) ) { + // Classic WP interface images + return ''; + } else if ( preg_match( '/[^\d]{1}\d{1,2}x\d+\.(gif|jpg|png)$/', $src ) ) { + // Most often tiny buttons/thumbs (< 100px wide) + return ''; + } else if ( preg_match( '/\/pixel\.(mathtag|quantserve)\.com/', $src ) ) { + // See mathtag.com and https://www.quantcast.com/how-we-do-it/iab-standard-measurement/how-we-collect-data/ + return ''; + } else if ( false !== strpos( $src, '/g.gif' ) ) { + // Classic WP stats gif + return ''; + } + + return $src; + } + + private function _limit_embed( $src ) { + $src = $this->_limit_url( $src ); + + if ( preg_match( '/\/\/www\.youtube\.com\/(embed|v)\/([^\?]+)\?.+$/', $src, $src_matches ) ) { + $src = 'https://www.youtube.com/watch?v=' . $src_matches[2]; + } else if ( preg_match( '/\/\/player\.vimeo\.com\/video\/([\d]+)([\?\/]{1}.*)?$/', $src, $src_matches ) ) { + $src = 'https://vimeo.com/' . 
(int) $src_matches[1]; + } else if ( preg_match( '/\/\/vimeo\.com\/moogaloop\.swf\?clip_id=([\d]+)$/', $src, $src_matches ) ) { + $src = 'https://vimeo.com/' . (int) $src_matches[1]; + } else if ( preg_match( '/\/\/vine\.co\/v\/([^\/]+)\/embed/', $src, $src_matches ) ) { + $src = 'https://vine.co/v/' . $src_matches[1]; + } else if ( ! preg_match( '/\/\/(m\.|www\.)?youtube\.com\/watch\?/', $src ) + && ! preg_match( '/\/youtu\.be\/.+$/', $src ) + && ! preg_match( '/\/\/vimeo\.com\/[\d]+$/', $src ) + && ! preg_match( '/\/\/(www\.)?dailymotion\.com\/video\/.+$/', $src ) + && ! preg_match( '/\/\/soundcloud\.com\/.+$/', $src ) + && ! preg_match( '/\/\/twitter\.com\/[^\/]+\/status\/[\d]+$/', $src ) + && ! preg_match( '/\/\/vine\.co\/v\/[^\/]+/', $src ) ) { + $src = ''; + } + + return $src; + } + + private function _process_meta_entry( $meta_name, $meta_value, $data ) { + if ( preg_match( '/:?(title|description|keywords)$/', $meta_name ) ) { + $data['_meta'][ $meta_name ] = $meta_value; + } else { + switch ( $meta_name ) { + case 'og:url': + case 'og:video': + case 'og:video:secure_url': + $meta_value = $this->_limit_embed( $meta_value ); + + if ( ! isset( $data['_embed'] ) ) { + $data['_embed'] = array(); + } + + if ( ! empty( $meta_value ) && ! in_array( $meta_value, $data['_embed'] ) ) { + $data['_embed'][] = $meta_value; + } + + break; + case 'og:image': + case 'og:image:secure_url': + case 'twitter:image0:src': + case 'twitter:image0': + case 'twitter:image:src': + case 'twitter:image': + $meta_value = $this->_limit_img( $meta_value ); + + if ( ! isset( $data['_img'] ) ) { + $data['_img'] = array(); + } + + if ( ! empty( $meta_value ) && ! in_array( $meta_value, $data['_img'] ) ) { + $data['_img'][] = $meta_value; + } + + break; + } + } + + return $data; + } + /** * Fetches and parses _meta, _img, and _links data from the source. 
* @@ -339,18 +495,42 @@ class WP_Press_This { return array( 'errors' => $source_content->get_error_messages() ); } + // Fetch and gather data first, so discovered media is offered 1st to user. + if ( empty( $data['_meta'] ) ) { + $data['_meta'] = array(); + } + + if ( preg_match_all( '/]+>/', $source_content, $matches ) ) { + $items = $this->_limit_array( $matches[0] ); + + foreach ( $items as $value ) { + if ( preg_match( '/(property|name)="([^"]+)"[^>]+content="([^"]+)"/', $value, $new_matches ) ) { + $meta_name = $this->_limit_string( $new_matches[2] ); + $meta_value = $this->_limit_string( $new_matches[3] ); + + // Sanity check. $meta_name is usually things like 'title', 'description', 'keywords', etc. + if ( strlen( $meta_name ) > 100 ) { + continue; + } + + $data = $this->_process_meta_entry( $meta_name, $meta_value, $data ); + } + } + } + // Fetch and gather data. if ( empty( $data['_img'] ) ) { $data['_img'] = array(); } - if ( preg_match_all( '//', $source_content, $matches ) ) { - if ( ! empty( $matches[0] ) ) { - foreach ( $matches[0] as $value ) { - if ( preg_match( '/]+src="([^"]+)"[^>]+\/>/', $value, $new_matches ) ) { - if ( ! in_array( $new_matches[1], $data['_img'] ) ) { - $data['_img'][] = $new_matches[1]; - } + if ( preg_match_all( '/]+>/', $source_content, $matches ) ) { + $items = $this->_limit_array( $matches[0] ); + + foreach ( $items as $value ) { + if ( preg_match( '/src=(\'|")([^\'"]+)\\1/', $value, $new_matches ) ) { + $src = $this->_limit_img( $new_matches[2] ); + if ( ! empty( $src ) && ! in_array( $src, $data['_img'] ) ) { + $data['_img'][] = $src; } } } @@ -361,66 +541,15 @@ class WP_Press_This { $data['_embed'] = array(); } - if ( preg_match_all( '/