You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
7.6 KiB
199 lines
7.6 KiB
<?php
|
|
/**
|
|
* HTML Parsing class for TablePress, used for import of HTML files.
|
|
*
|
|
* @package TablePress
|
|
* @subpackage Import
|
|
* @author Tobias Bäthge
|
|
* @since 2.0.0
|
|
*/
|
|
|
|
// Abort when this file is executed before the ABSPATH constant has been defined, i.e. when it is requested directly.
if ( ! defined( 'ABSPATH' ) ) {
	die( 'No direct script access allowed!' );
}
|
|
|
|
/**
|
|
* HTML Parsing class
|
|
*
|
|
* @package TablePress
|
|
* @subpackage Import
|
|
* @author Tobias Bäthge
|
|
* @since 2.0.0
|
|
*/
|
|
abstract class HTML_Parser {

	/**
	 * Parses HTML string into a two-dimensional array, maybe with options.
	 *
	 * Only the first `<table>` element of the markup is imported. Its rows are collected in this order:
	 * rows of the first `<thead>`, rows of every `<tbody>`, rows that are direct children of `<table>`,
	 * and rows of the first `<tfoot>`. Merged cells are represented by the trigger words `#colspan#`,
	 * `#rowspan#`, and `#span#`.
	 *
	 * The libxml "internal errors" setting is changed only for the duration of the call and restored afterwards.
	 *
	 * @since 2.0.0
	 *
	 * @param string $html Data to be parsed.
	 * @return array<string, mixed>|WP_Error Array with table data and options (current table head and foot row) on success, WP_Error on error.
	 */
	public static function parse( string $html ) /* : array|WP_Error */ {
		// Cheap sanity check before the markup is handed to the DOM parser.
		if ( false === stripos( $html, '<table' ) || false === stripos( $html, '</table>' ) ) {
			return new WP_Error( 'table_import_html_no_table_found' );
		}

		// Prepend XML declaration, for better encoding support.
		$full_html = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' . $html;

		if ( function_exists( 'libxml_disable_entity_loader' ) ) {
			/*
			 * Don't expand external entities, see https://websec.io/2012/08/27/Preventing-XXE-in-PHP.html.
			 * Silence warnings as the function is deprecated in PHP 8, but can be necessary with LIBXML_NOENT being defined, see https://core.trac.wordpress.org/changeset/50714.
			 */
			@libxml_disable_entity_loader( true ); // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged,Generic.PHP.DeprecatedFunctions.Deprecated
		}

		// No warnings/errors raised, but stored internally. The previous setting is remembered, so that it can be restored.
		$previous_use_internal_errors = libxml_use_internal_errors( true );

		try {
			$dom = new DOMDocument( '1.0', 'UTF-8' );
			// No strict checking for invalid HTML.
			$dom->strictErrorChecking = false; // phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase

			$result = $dom->loadHTML( $full_html );
			if ( ! $result ) {
				return new WP_Error( 'table_import_html_dom_load_html_failed' );
			}

			$dom_tables = $dom->getElementsByTagName( 'table' );
			if ( 0 === $dom_tables->length ) {
				return new WP_Error( 'table_import_html_dom_get_tables' );
			}

			// Clear errors so that we only catch those that are raised while the table is converted in the next line.
			libxml_clear_errors();
			$table = simplexml_import_dom( $dom_tables->item( 0 ) ); // @phpstan-ignore-line
			if ( ! $table instanceof SimpleXMLElement ) {
				return new WP_Error( 'table_import_html_simplexml_import_dom_failed' );
			}

			$errors = libxml_get_errors();
		} finally {
			// Runs for every exit path above (including the early returns): leave no global libxml state behind.
			libxml_clear_errors();
			libxml_use_internal_errors( $previous_use_internal_errors );
		}

		if ( ! empty( $errors ) ) {
			// Only these three libxml severity levels are reported, errors with any other level are skipped.
			$level_labels = array(
				LIBXML_ERR_WARNING => 'Warning',
				LIBXML_ERR_ERROR   => 'Error',
				LIBXML_ERR_FATAL   => 'Fatal Error',
			);

			$output = '<strong>' . __( 'The imported file contains errors:', 'tablepress' ) . '</strong><br /><br />';
			foreach ( $errors as $error ) {
				if ( ! isset( $level_labels[ $error->level ] ) ) {
					continue;
				}
				// The message is produced while parsing the imported (untrusted) file, so it is escaped before being printed as HTML.
				$message = esc_html( $error->message );
				$output .= "{$level_labels[ $error->level ]} {$error->code}: {$message} in line {$error->line}, column {$error->column}<br />";
			}
			wp_die( $output, 'Import Error', array( 'response' => 200, 'back_link' => true ) );
		}

		$html_table = array(
			'data'    => array(),
			'options' => array(),
		);

		if ( isset( $table->thead ) ) {
			$html_table['data'] = array_merge( $html_table['data'], self::_import_html_rows( $table->thead[0]->tr ) ); // @phpstan-ignore-line
			$html_table['options']['table_head'] = true;
		}

		if ( isset( $table->tbody ) ) {
			// A table may contain several <tbody> sections, and the rows of all of them are imported.
			foreach ( $table->tbody as $tbody ) {
				$html_table['data'] = array_merge( $html_table['data'], self::_import_html_rows( $tbody->tr ) );
			}
		}

		if ( isset( $table->tr ) ) {
			$html_table['data'] = array_merge( $html_table['data'], self::_import_html_rows( $table->tr ) );
		}

		if ( isset( $table->tfoot ) ) {
			$html_table['data'] = array_merge( $html_table['data'], self::_import_html_rows( $table->tfoot[0]->tr ) ); // @phpstan-ignore-line
			$html_table['options']['table_foot'] = true;
		}

		return $html_table;
	}

	/**
	 * Converts table HTML rows to an array.
	 *
	 * Every cell contributes its inner HTML. A cell with a `colspan` of N is followed by N-1 `#colspan#` cells.
	 * A cell with a `rowspan` of N reserves a `#rowspan#` cell (and `#span#` cells for the additional columns
	 * of a simultaneous `colspan`) at the same column position(s) in each of the following N-1 rows.
	 *
	 * @since 2.0.0
	 *
	 * @param SimpleXMLElement $element XMLElement.
	 * @return array<int, array<int, string>> SimpleXMLElement exported to an array.
	 */
	protected static function _import_html_rows( SimpleXMLElement $element ): array {
		$rows     = array(); // Container for the table data.
		$rowspans = array(); // Trigger words reserved by rowspans, as $rowspans[ row index ][ column index ], for rows that follow the currently processed row.

		$row_idx = 0;
		foreach ( $element as $row ) {
			// If all cells in a row should be merged with the cells in the row above, add the trigger word to each of them (should be very rare).
			while ( isset( $rowspans[ $row_idx ] ) && count( $rowspans[ $row_idx ] ) === count( $rows[ $row_idx - 1 ] ) ) { // phpcs:ignore Squiz.PHP.DisallowSizeFunctionsInLoops.Found
				$rows[] = self::_sort_span_row( $rowspans[ $row_idx ] );
				++$row_idx;
			}

			$new_row    = array();
			$column_idx = 0; // Column position of the next cell, i.e. its index in the finished row.
			foreach ( $row as $cell ) {
				// If a cell in a row should be merged with the cell above it, add the trigger word to it.
				while ( isset( $rowspans[ $row_idx ][ $column_idx ] ) ) {
					$new_row[] = $rowspans[ $row_idx ][ $column_idx ];
					++$column_idx;
				}

				// Anything that is not a <td> or <th> element has no cell content and can not span other cells.
				if ( ! in_array( strtolower( $cell->getName() ), array( 'td', 'th' ), true ) ) {
					$new_row[] = '';
					++$column_idx;
					continue;
				}

				$new_row[] = self::_get_cell_content( $cell );

				/*
				 * Read the spans from the attributes of the cell itself, and not from its serialized markup.
				 * This also works for empty cells and ignores text like "colspan=3" in the cell content or in nested tables.
				 */
				$colspan = self::_get_cell_span( $cell, 'colspan' );
				$rowspan = self::_get_cell_span( $cell, 'rowspan' );

				// Add cells with the colspan trigger word, if merged cells across columns were found.
				for ( $i = 1; $i < $colspan; $i++ ) {
					$new_row[] = '#colspan#';
				}

				// If merged cells across rows were found, reserve the trigger words for the affected cells in the following rows.
				for ( $i = 1; $i < $rowspan; $i++ ) {
					$target_row_idx = $row_idx + $i;
					if ( ! isset( $rowspans[ $target_row_idx ] ) ) {
						$rowspans[ $target_row_idx ] = array();
					}
					$rowspans[ $target_row_idx ][ $column_idx ] = '#rowspan#';
					for ( $j = 1; $j < $colspan; $j++ ) {
						$rowspans[ $target_row_idx ][ $column_idx + $j ] = '#span#';
					}
				}

				// The cell occupies $colspan columns, so the next cell starts that many columns further to the right.
				$column_idx += $colspan;
			}

			// After the last cell in a row: If a cell in a row should be merged with the cell above it, add the trigger word to it.
			while ( isset( $rowspans[ $row_idx ][ $column_idx ] ) ) {
				$new_row[] = $rowspans[ $row_idx ][ $column_idx ];
				++$column_idx;
			}

			$rows[] = $new_row;
			++$row_idx;
		}

		// After the last data row: If all cells in a row should be merged with the cells in the row above, add the trigger word to each of them (should be very rare).
		while ( isset( $rowspans[ $row_idx ] ) && count( $rowspans[ $row_idx ] ) === count( $rows[ $row_idx - 1 ] ) ) { // phpcs:ignore Squiz.PHP.DisallowSizeFunctionsInLoops.Found
			$rows[] = self::_sort_span_row( $rowspans[ $row_idx ] );
			++$row_idx;
		}

		return $rows;
	}

	/**
	 * Extracts the inner HTML of a table cell.
	 *
	 * @param SimpleXMLElement $cell A `<td>` or `<th>` element.
	 * @return string Content between `<td>...</td>` or `<th>...</th>`, possibly with HTML. Empty string if there is no content, e.g. for a cell that is serialized as `<td/>`.
	 */
	protected static function _get_cell_content( SimpleXMLElement $cell ): string {
		$cell_xml = $cell->asXML();
		if ( false === $cell_xml || 1 !== preg_match( '#<t[dh].*?>(.*)</t[dh]>#is', $cell_xml, $matches ) ) {
			return '';
		}

		/*
		 * Decode HTML entities again, as there might be some left especially in attributes of HTML tags in the cells,
		 * see https://secure.php.net/manual/en/simplexmlelement.asxml.php#107137.
		 */
		return html_entity_decode( $matches[1], ENT_NOQUOTES, 'UTF-8' );
	}

	/**
	 * Reads a `colspan` or `rowspan` attribute of a table cell.
	 *
	 * @param SimpleXMLElement $cell      A `<td>` or `<th>` element.
	 * @param string           $attribute Name of the attribute, `colspan` or `rowspan`.
	 * @return int Number of spanned columns/rows, at least 1. Missing, non-numeric, and zero values result in 1.
	 */
	protected static function _get_cell_span( SimpleXMLElement $cell, string $attribute ): int {
		$value = $cell[ $attribute ];
		// Only leading digits are used, so that values like "1e3" are not interpreted as numbers in exponent notation.
		if ( null === $value || 1 !== preg_match( '/^\s*\+?(\d+)/', (string) $value, $matches ) ) {
			return 1;
		}

		return max( 1, (int) $matches[1] );
	}

	/**
	 * Brings the reserved trigger words of a completely merged row into column order.
	 *
	 * The entries of such a row can have been reserved by cells from different rows, so that they are not
	 * necessarily stored in the order of their columns.
	 *
	 * @param array<int, string> $span_row Trigger words, indexed by column.
	 * @return array<int, string> Trigger words as a list, ordered by column.
	 */
	protected static function _sort_span_row( array $span_row ): array {
		ksort( $span_row );
		return array_values( $span_row );
	}

} // class HTML_Parser
|