For God so loved the world, that He gave His only begotten Son, that all who believe in Him should not perish but have everlasting life
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

549 lines
21 KiB

  1. <?php
  // Bootstrap: if no earlier include has defined the library root, define it
  // relative to this file and load the PHPExcel autoloader from it.
  if (defined('PHPEXCEL_ROOT') === false) {
      /**
       * @ignore
       */
      define('PHPEXCEL_ROOT', __DIR__ . '/../../');
      require PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php';
  }
  9. /**
  10. * PHPExcel_Reader_HTML
  11. *
  12. * Copyright (c) 2006 - 2015 PHPExcel
  13. *
  14. * This library is free software; you can redistribute it and/or
  15. * modify it under the terms of the GNU Lesser General Public
  16. * License as published by the Free Software Foundation; either
  17. * version 2.1 of the License, or (at your option) any later version.
  18. *
  19. * This library is distributed in the hope that it will be useful,
  20. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. * Lesser General Public License for more details.
  23. *
  24. * You should have received a copy of the GNU Lesser General Public
  25. * License along with this library; if not, write to the Free Software
  26. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. *
  28. * @category PHPExcel
  29. * @package PHPExcel_Reader
  30. * @copyright Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel)
  31. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL
  32. * @version ##VERSION##, ##DATE##
  33. */
  34. /** PHPExcel root directory */
  35. class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
  36. {
  /**
   * Input encoding
   *
   * @var string
   */
  protected $inputEncoding = 'ANSI';

  /**
   * Sheet index to read
   *
   * @var int
   */
  protected $sheetIndex = 0;

  /**
   * Style arrays keyed by HTML tag name.
   *
   * processDomElement() passes the matching entry to applyFromArray() on the
   * style of the cell generated for that tag.
   *
   * @var array
   */
  protected $formats = array(
      'h1' => array(
          'font' => array(
              'bold' => true,
              'size' => 24,
          ),
      ), // Bold, 24pt
      'h2' => array(
          'font' => array(
              'bold' => true,
              'size' => 18,
          ),
      ), // Bold, 18pt
      'h3' => array(
          'font' => array(
              'bold' => true,
              'size' => 13.5,
          ),
      ), // Bold, 13.5pt
      'h4' => array(
          'font' => array(
              'bold' => true,
              'size' => 12,
          ),
      ), // Bold, 12pt
      'h5' => array(
          'font' => array(
              'bold' => true,
              'size' => 10,
          ),
      ), // Bold, 10pt
      'h6' => array(
          'font' => array(
              'bold' => true,
              'size' => 7.5,
          ),
      ), // Bold, 7.5pt
      'a' => array(
          'font' => array(
              'underline' => true,
              'color' => array(
                  'argb' => PHPExcel_Style_Color::COLOR_BLUE,
              ),
          ),
      ), // Blue underlined
      'hr' => array(
          'borders' => array(
              'bottom' => array(
                  'style' => PHPExcel_Style_Border::BORDER_THIN,
                  'color' => array(
                      // The colour must be keyed ('argb', as for the 'a' format above);
                      // an unkeyed list element is not a recognised colour definition.
                      'argb' => PHPExcel_Style_Color::COLOR_BLACK,
                  ),
              ),
          ),
      ), // Bottom border
  );

  /**
   * Coordinates (column letter followed by row number) of cells covered by a
   * merged range that was created for a rowspan attribute. Keys are the
   * coordinates, values are always true.
   *
   * @var array
   */
  protected $rowspan = array();
  /**
   * Create a new PHPExcel_Reader_HTML.
   *
   * Installs a PHPExcel_Reader_DefaultReadFilter instance as the reader's read filter.
   */
  public function __construct()
  {
      $defaultFilter = new PHPExcel_Reader_DefaultReadFilter();
      $this->readFilter = $defaultFilter;
  }
  /**
   * Check whether the currently opened file looks like HTML.
   *
   * Reads up to 2048 bytes from $this->fileHandle; the sample counts as HTML
   * when it contains a '<' and strip_tags() makes it shorter.
   *
   * @return boolean
   */
  protected function isValidFormat()
  {
      // 2048 bytes should be enough to decide whether the format is HTML
      $sample = fread($this->fileHandle, 2048);

      if (strpos($sample, '<') === false) {
          return false;
      }

      return strlen(strip_tags($sample)) !== strlen($sample);
  }
  /**
   * Load an HTML file into a freshly created PHPExcel workbook.
   *
   * @param string $pFilename
   * @return PHPExcel
   * @throws PHPExcel_Reader_Exception
   */
  public function load($pFilename)
  {
      // All of the work is delegated to loadIntoExisting() on a new workbook
      return $this->loadIntoExisting($pFilename, new PHPExcel());
  }
  /**
   * Set input encoding
   *
   * @param string $pValue Input encoding
   * @return PHPExcel_Reader_HTML This reader, so calls can be chained
   */
  public function setInputEncoding($pValue = 'ANSI')
  {
      $this->inputEncoding = $pValue;

      return $this;
  }
  /**
   * Get input encoding
   *
   * @return string The value last stored by setInputEncoding(), 'ANSI' by default
   */
  public function getInputEncoding()
  {
      $encoding = $this->inputEncoding;

      return $encoding;
  }
  // Data Array used for testing only, should write to PHPExcel object on completion of tests
  protected $dataArray = array();

  // Current table nesting depth; 0 while no table is open
  protected $tableLevel = 0;

  // Start column recorded for each table nesting level, indexed by level
  protected $nestedColumn = array('A');

  /**
   * Open a new table nesting level and record the column it starts in.
   *
   * An outermost table (opened while the nesting level is 0) always starts in
   * column 'A', whatever column is passed in.
   *
   * @param string $column Column at which a nested table starts
   * @return string The start column recorded for the new level
   */
  protected function setTableStartColumn($column)
  {
      $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

      $this->tableLevel += 1;
      $this->nestedColumn[$this->tableLevel] = $startColumn;

      return $startColumn;
  }
  /**
   * Get the start column recorded for the current table nesting level.
   *
   * @return string
   */
  protected function getTableStartColumn()
  {
      $level = $this->tableLevel;

      return $this->nestedColumn[$level];
  }
  /**
   * Close the current table nesting level.
   *
   * Removes the last recorded start column and decrements the nesting level.
   *
   * @return string The start column that was removed
   */
  protected function releaseTableStartColumn()
  {
      $released = array_pop($this->nestedColumn);
      $this->tableLevel -= 1;

      return $released;
  }
  /**
   * Write the buffered cell content to the worksheet and empty the buffer.
   *
   * A string buffer is written to the cell at $column . $row (and mirrored in
   * $this->dataArray) only when it holds something other than whitespace.
   * Any non-string buffer is treated as a rich text run, which is not written
   * to the sheet yet and is only noted in $this->dataArray.
   *
   * @param object $sheet       Worksheet to write to
   * @param string $column      Column letter of the target cell
   * @param int    $row         Row number of the target cell
   * @param mixed  $cellContent Buffered content; always reset to '' on return
   */
  protected function flushCell($sheet, $column, $row, &$cellContent)
  {
      if (!is_string($cellContent)) {
          // We have a Rich Text run
          // TODO
          $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
      } elseif (trim($cellContent) !== '') {
          // Simple string content: only write it when there is something in the string
          $sheet->setCellValue($column . $row, $cellContent, true);
          $this->dataArray[$row][$column] = $cellContent;
      }

      $cellContent = '';
  }
  /**
   * Recursively walk the children of a DOM node and turn them into worksheet content.
   *
   * Text nodes are appended to $cellContent. Element nodes, depending on their
   * tag name, flush that buffer into the cell at the ($column, $row) cursor,
   * move the cursor, apply styles or merges, or simply recurse into their children.
   *
   * @param DOMNode $element     Node whose child nodes are processed
   * @param object  $sheet       Worksheet being populated (loadIntoExisting() passes the active sheet)
   * @param int     $row         Row cursor, updated in place
   * @param string  $column      Column cursor (column letter), updated in place
   * @param string  $cellContent Text buffered for the current cell, updated in place
   * @param mixed   $format      Not used by this method
   */
  protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
  {
      foreach ($element->childNodes as $child) {
          if ($child instanceof DOMText) {
              // Collapse every run of whitespace inside the text into a single space
              $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
              if (is_string($cellContent)) {
                  // Simply append the text if the cell content is a plain text string
                  $cellContent .= $domText;
              }
              // TODO: append correctly when the cell content is a rich text run
              continue;
          }

          if (!($child instanceof DOMElement)) {
              // Other node types (comments, doctype, ...) carry nothing we read
              continue;
          }

          $attributeArray = array();
          foreach ($child->attributes as $attribute) {
              $attributeArray[$attribute->name] = $attribute->value;
          }

          switch ($child->nodeName) {
              case 'meta':
                  // TODO: extract the character set from the 'content' attribute,
                  // so we can convert to UTF-8 if required
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  break;
              case 'title':
                  // The document title becomes the worksheet title
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  $sheet->setTitle($cellContent);
                  $cellContent = '';
                  break;
              case 'span':
              case 'div':
              case 'font':
              case 'i':
              case 'em':
              case 'strong':
              case 'b':
                  // Inline/styling containers: keep their text in the current cell,
                  // separated from neighbouring text by a space
                  if ($cellContent > '') {
                      $cellContent .= ' ';
                  }
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  if ($cellContent > '') {
                      $cellContent .= ' ';
                  }
                  break;
              case 'hr':
                  $this->flushCell($sheet, $column, $row, $cellContent);
                  ++$row;
                  if (isset($this->formats[$child->nodeName])) {
                      // Render the rule as a styled (bottom-bordered) cell
                      $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                  } else {
                      $cellContent = '----------';
                      $this->flushCell($sheet, $column, $row, $cellContent);
                  }
                  ++$row;
                  // Add a break after a horizontal rule, simply by allowing the code to dropthru
              case 'br':
                  if ($this->tableLevel > 0) {
                      // If we're inside a table, replace with a \n
                      $cellContent .= "\n";
                  } else {
                      // Otherwise flush our existing content and move the row cursor on
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      ++$row;
                  }
                  break;
              case 'a':
                  if (isset($attributeArray['href'])) {
                      // Attach the link target to the cell at the cursor and style it as a link
                      $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                      if (isset($this->formats[$child->nodeName])) {
                          $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                      }
                  }
                  $cellContent .= ' ';
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  break;
              case 'h1':
              case 'h2':
              case 'h3':
              case 'h4':
              case 'h5':
              case 'h6':
              case 'ol':
              case 'ul':
              case 'p':
                  if ($this->tableLevel > 0) {
                      // If we're inside a table, replace with a \n
                      $cellContent .= "\n";
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  } else {
                      // Outside a table each block gets a row of its own
                      if ($cellContent > '') {
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          $row++;
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      if (isset($this->formats[$child->nodeName])) {
                          $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                      }
                      $row++;
                      $column = 'A';
                  }
                  break;
              case 'li':
                  if ($this->tableLevel > 0) {
                      // If we're inside a table, replace with a \n
                      $cellContent .= "\n";
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  } else {
                      if ($cellContent > '') {
                          $this->flushCell($sheet, $column, $row, $cellContent);
                      }
                      ++$row;
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      $column = 'A';
                  }
                  break;
              case 'table':
                  $this->flushCell($sheet, $column, $row, $cellContent);
                  $column = $this->setTableStartColumn($column);
                  if ($this->tableLevel > 1) {
                      // NOTE(review): presumably keeps a nested table on the row of
                      // the cell that contains it - confirm
                      --$row;
                  }
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  $column = $this->releaseTableStartColumn();
                  if ($this->tableLevel > 1) {
                      ++$column;
                  } else {
                      ++$row;
                  }
                  break;
              case 'thead':
              case 'tbody':
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  break;
              case 'tr':
                  // Every table row starts again at the start column of its table
                  $column = $this->getTableStartColumn();
                  $cellContent = '';
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  ++$row;
                  break;
              case 'th':
              case 'td':
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  // Skip over cells already claimed by a rowspan merge
                  while (isset($this->rowspan[$column . $row])) {
                      ++$column;
                  }
                  $this->flushCell($sheet, $column, $row, $cellContent);
                  // TODO: the cell's inline 'style' attribute is not applied yet
                  $this->mergeSpannedCell($sheet, $row, $column, $attributeArray);
                  ++$column;
                  break;
              case 'body':
                  $row = 1;
                  $column = 'A';
                  // Start the body with an empty buffer so that text collected before
                  // it (e.g. from <head>) does not end up in the first cell
                  $cellContent = '';
                  $this->tableLevel = 0;
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                  break;
              default:
                  $this->processDomElement($child, $sheet, $row, $column, $cellContent);
          }
      }
  }

  /**
   * Merge the cell at ($column, $row) according to its rowspan/colspan attributes.
   *
   * Does nothing when neither attribute is present. Span values are read as
   * integers and treated as at least 1. When a rowspan attribute is present,
   * every cell of the merged range is recorded in $this->rowspan. On return
   * $column is the last column of the merged range.
   *
   * @param object $sheet          Worksheet holding the cell
   * @param int    $row            Row of the cell
   * @param string $column         Column of the cell, updated in place
   * @param array  $attributeArray Attributes of the th/td element, keyed by name
   */
  private function mergeSpannedCell($sheet, $row, &$column, array $attributeArray)
  {
      $hasRowspan = isset($attributeArray['rowspan']);
      $hasColspan = isset($attributeArray['colspan']);
      if (!$hasRowspan && !$hasColspan) {
          return;
      }

      $rowCount = $hasRowspan ? max(1, (int) $attributeArray['rowspan']) : 1;
      $columnCount = $hasColspan ? max(1, (int) $attributeArray['colspan']) : 1;

      // Column letters are advanced with the string increment operator
      $columnTo = $column;
      for ($i = 1; $i < $columnCount; ++$i) {
          ++$columnTo;
      }

      $range = $column . $row . ':' . $columnTo . ($row + $rowCount - 1);
      if ($hasRowspan) {
          foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $coordinate) {
              $this->rowspan[$coordinate] = true;
          }
      }
      $sheet->mergeCells($range);
      $column = $columnTo;
  }
  /**
   * Load an HTML file into an existing PHPExcel workbook.
   *
   * The file is first opened and checked with isValidFormat(). Sheets are then
   * created until the configured sheet index exists, that sheet is made active,
   * and the document is parsed into a DOM tree that processDomElement() writes
   * into the active sheet.
   *
   * @param string $pFilename
   * @param PHPExcel $objPHPExcel
   * @return PHPExcel The workbook that was passed in
   * @throws PHPExcel_Reader_Exception When the file does not look like HTML or cannot be loaded as a DOM document
   */
  public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
  {
      // Open the file just long enough to validate it; the handle is closed either way
      $this->openFile($pFilename);
      $looksLikeHtml = $this->isValidFormat();
      fclose($this->fileHandle);
      if (!$looksLikeHtml) {
          throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
      }

      // Make sure the sheet we are asked to fill exists, then select it
      while ($this->sheetIndex >= $objPHPExcel->getSheetCount()) {
          $objPHPExcel->createSheet();
      }
      $objPHPExcel->setActiveSheetIndex($this->sheetIndex);

      // Reload the (security scanned) HTML file into a new DOM object
      $dom = new DOMDocument();
      $html = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');
      if ($dom->loadHTML($html) === false) {
          throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
      }

      // Discard white space
      $dom->preserveWhiteSpace = false;

      // Cursor and text buffer handed by reference through the recursive walk
      $rowCursor = 0;
      $columnCursor = 'A';
      $buffer = '';
      $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $rowCursor, $columnCursor, $buffer);

      return $objPHPExcel;
  }
  /**
   * Get sheet index
   *
   * @return int Index of the sheet that loadIntoExisting() fills, 0 by default
   */
  public function getSheetIndex()
  {
      $index = $this->sheetIndex;

      return $index;
  }
  /**
   * Set sheet index
   *
   * @param int $pValue Sheet index
   * @return PHPExcel_Reader_HTML This reader, so calls can be chained
   */
  public function setSheetIndex($pValue = 0)
  {
      $this->sheetIndex = $pValue;

      return $this;
  }
  /**
   * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
   *
   * The pattern allows an optional NUL byte before, after and between the
   * characters of the marker, so it also matches when the marker's characters
   * are interleaved with NUL bytes.
   *
   * @param string $xml
   * @return string The unchanged input when no entity declaration is found
   * @throws PHPExcel_Reader_Exception
   */
  public function securityScan($xml)
  {
      $marker = implode('\\0?', str_split('<!ENTITY'));
      $pattern = '/\\0?' . $marker . '\\0?/';

      if (!preg_match($pattern, $xml)) {
          return $xml;
      }

      throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
  }
  509. }