123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549 |
- <?php
// Bootstrap the PHPExcel autoloader when PHPEXCEL_ROOT has not been defined yet.
if (defined('PHPEXCEL_ROOT') === false) {
    define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
    require(PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php');
}
- class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
- {
-
/**
 * Input encoding label, read and written through getInputEncoding()/setInputEncoding().
 *
 * @var string
 */
protected $inputEncoding = 'ANSI';

/**
 * Index of the worksheet that loadIntoExisting() activates and fills.
 *
 * @var int
 */
protected $sheetIndex = 0;

/**
 * Style arrays keyed by HTML tag name; processDomElement() passes the matching
 * entry to applyFromArray() on the style of the cell produced for that tag.
 *
 * @var array
 */
protected $formats = array(
    // Headings: bold, with a decreasing font size
    'h1' => array(
        'font' => array(
            'bold' => true,
            'size' => 24,
        ),
    ),
    'h2' => array(
        'font' => array(
            'bold' => true,
            'size' => 18,
        ),
    ),
    'h3' => array(
        'font' => array(
            'bold' => true,
            'size' => 13.5,
        ),
    ),
    'h4' => array(
        'font' => array(
            'bold' => true,
            'size' => 12,
        ),
    ),
    'h5' => array(
        'font' => array(
            'bold' => true,
            'size' => 10,
        ),
    ),
    'h6' => array(
        'font' => array(
            'bold' => true,
            'size' => 7.5,
        ),
    ),
    // Hyperlinks: blue and underlined
    'a' => array(
        'font' => array(
            'underline' => true,
            'color' => array(
                'argb' => PHPExcel_Style_Color::COLOR_BLUE,
            ),
        ),
    ),
    // Horizontal rule: thin bottom border
    'hr' => array(
        'borders' => array(
            'bottom' => array(
                'style' => PHPExcel_Style_Border::BORDER_THIN,
                'color' => array(
                    // Keyed by 'argb' like the 'a' entry above, so the colour
                    // value is addressed explicitly instead of by list position
                    'argb' => PHPExcel_Style_Color::COLOR_BLACK,
                ),
            ),
        ),
    ),
);

/**
 * Cell references (e.g. "B3") covered by a merge that was created for a
 * rowspan attribute; th/td handling skips over columns listed here.
 *
 * @var array
 */
protected $rowspan = array();
-
/**
 * Create a new HTML reader and install a PHPExcel_Reader_DefaultReadFilter
 * instance as its read filter.
 */
public function __construct()
{
    $defaultFilter = new PHPExcel_Reader_DefaultReadFilter();
    $this->readFilter = $defaultFilter;
}
-
/**
 * Sniff the start of the already opened file to decide whether it looks like HTML.
 *
 * Reads up to 2048 bytes from $this->fileHandle and reports true when that
 * chunk contains a '<' and strip_tags() makes it shorter, i.e. it holds at
 * least one tag.
 *
 * @return boolean
 */
protected function isValidFormat()
{
    $head = fread($this->fileHandle, 2048);

    // Without any '<' there cannot be a tag at all
    if (strpos($head, '<') === false) {
        return false;
    }

    // Markup is present when stripping tags removes at least one byte
    return strlen($head) !== strlen(strip_tags($head));
}
-
/**
 * Load an HTML file into a freshly created PHPExcel workbook.
 *
 * @param  string $pFilename Path of the HTML file to read
 * @return PHPExcel The workbook returned by loadIntoExisting()
 */
public function load($pFilename)
{
    // Start from an empty workbook and delegate the actual parsing
    $workbook = new PHPExcel();
    $loaded = $this->loadIntoExisting($pFilename, $workbook);

    return $loaded;
}
-
/**
 * Store the input encoding label.
 *
 * @param  string $pValue Encoding label (defaults to 'ANSI')
 * @return PHPExcel_Reader_HTML This reader, to allow method chaining
 */
public function setInputEncoding($pValue = 'ANSI')
{
    $reader = $this;
    $reader->inputEncoding = $pValue;

    return $reader;
}
-
/**
 * Return the input encoding label currently stored on the reader.
 *
 * @return string
 */
public function getInputEncoding()
{
    $encoding = $this->inputEncoding;

    return $encoding;
}
-
/**
 * Copy of the values handled by flushCell(), indexed as [row][column].
 *
 * @var array
 */
protected $dataArray = array();

/**
 * Current table nesting depth; 0 while no table is open.
 *
 * @var int
 */
protected $tableLevel = 0;

/**
 * Start column per table nesting level (index 0 is the document level, 'A').
 *
 * @var array
 */
protected $nestedColumn = array('A');

/**
 * Enter a (possibly nested) table and remember the column it starts in.
 *
 * A top-level table always starts in column 'A'; a nested table starts in
 * the column passed in.
 *
 * @param  string $column Column the parser is currently positioned at
 * @return string Start column recorded for the new nesting level
 */
protected function setTableStartColumn($column)
{
    $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

    $this->tableLevel += 1;
    $this->nestedColumn[$this->tableLevel] = $startColumn;

    return $startColumn;
}

/**
 * Return the start column recorded for the current table nesting level.
 *
 * @return string
 */
protected function getTableStartColumn()
{
    $level = $this->tableLevel;

    return $this->nestedColumn[$level];
}

/**
 * Leave the current table: drop its recorded start column and step one
 * nesting level back.
 *
 * @return string Start column of the table that was just closed
 */
protected function releaseTableStartColumn()
{
    $finishedColumn = array_pop($this->nestedColumn);
    $this->tableLevel -= 1;

    return $finishedColumn;
}
/**
 * Write the pending cell content to the sheet and reset the buffer.
 *
 * String content is written only when it is not blank after trimming;
 * non-string content is not written to the sheet, only recorded in
 * $this->dataArray with a 'RICH TEXT: ' prefix. In every case
 * $cellContent is an empty string afterwards.
 *
 * @param object $sheet       Worksheet receiving the value (setCellValue() is called on it)
 * @param string $column      Column letter(s) of the target cell
 * @param int    $row         Row number of the target cell
 * @param mixed  $cellContent Pending content, passed by reference and cleared
 */
protected function flushCell($sheet, $column, $row, &$cellContent)
{
    if (!is_string($cellContent)) {
        // Non-string content: only keep a trace of it in the data array
        $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
    } elseif (trim($cellContent) > '') {
        // Plain text that is more than whitespace: store it in the cell
        $sheet->setCellValue($column . $row, $cellContent, true);
        $this->dataArray[$row][$column] = $cellContent;
    }

    // Start the next cell with an empty buffer
    $cellContent = '';
}
/**
 * Recursively walk the children of a DOM node and transfer their content to the worksheet.
 *
 * Text nodes are appended to the pending cell buffer; element nodes are
 * dispatched on their tag name to decide when the buffer is flushed into a
 * cell and how the current row/column position moves. $row, $column and
 * $cellContent are passed by reference so that the position and the buffer
 * are shared across all recursion levels.
 *
 * @param DOMNode $element     Node whose children are processed
 * @param object  $sheet       Worksheet being filled
 * @param int     $row         Current row (by reference)
 * @param string  $column      Current column letter(s) (by reference)
 * @param mixed   $cellContent Pending cell content (by reference)
 * @param mixed   $format      Not used by this method
 */
protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
{
    foreach ($element->childNodes as $child) {
        if ($child instanceof DOMText) {
            // Trim the text and collapse every whitespace run into a single space
            $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
            if (is_string($cellContent)) {
                // Plain-text buffer: simply append
                $cellContent .= $domText;
            }
            // A non-string buffer is left untouched; text is not added to it here
        } elseif ($child instanceof DOMElement) {
            // Collect the element's attributes as name => value
            $attributeArray = array();
            foreach ($child->attributes as $attribute) {
                $attributeArray[$attribute->name] = $attribute->value;
            }

            switch ($child->nodeName) {
                case 'meta':
                    // Attributes of <meta> are not evaluated; only descend
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;
                case 'title':
                    // The document title becomes the worksheet title
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    $sheet->setTitle($cellContent);
                    $cellContent = '';
                    break;
                case 'span':
                case 'div':
                case 'font':
                case 'i':
                case 'em':
                case 'strong':
                case 'b':
                    // Inline containers: keep their text in the same cell, separated by spaces
                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }
                    break;
                case 'hr':
                    $this->flushCell($sheet, $column, $row, $cellContent);
                    ++$row;
                    if (isset($this->formats[$child->nodeName])) {
                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                    } else {
                        // No style registered for the rule: draw it as text
                        $cellContent = '----------';
                        $this->flushCell($sheet, $column, $row, $cellContent);
                    }
                    ++$row;
                    // No break: falls through so the rule is followed by a line break
                    // (assumed intentional)
                case 'br':
                    if ($this->tableLevel > 0) {
                        // Inside a table the line break stays within the cell
                        $cellContent .= "\n";
                    } else {
                        // Outside a table a line break starts a new row
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                    }
                    break;
                case 'a':
                    foreach ($attributeArray as $attributeName => $attributeValue) {
                        switch ($attributeName) {
                            case 'href':
                                // Attach the link target and the link style to the current cell
                                $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                if (isset($this->formats[$child->nodeName])) {
                                    $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                }
                                break;
                        }
                    }
                    $cellContent .= ' ';
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;
                case 'h1':
                case 'h2':
                case 'h3':
                case 'h4':
                case 'h5':
                case 'h6':
                case 'ol':
                case 'ul':
                case 'p':
                    if ($this->tableLevel > 0) {
                        // Inside a table: block elements become new lines within the cell
                        $cellContent .= "\n";
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        // Outside a table: each block element gets a row of its own
                        if ($cellContent > '') {
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $row++;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        }
                        $row++;
                        $column = 'A';
                    }
                    break;
                case 'li':
                    if ($this->tableLevel > 0) {
                        // Inside a table: list items become new lines within the cell
                        $cellContent .= "\n";
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        // Outside a table: every list item starts on the next row
                        if ($cellContent > '') {
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = 'A';
                    }
                    break;
                case 'table':
                    $this->flushCell($sheet, $column, $row, $cellContent);
                    $column = $this->setTableStartColumn($column);
                    if ($this->tableLevel > 1) {
                        // Nested table: step one row back before processing it
                        --$row;
                    }
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    $column = $this->releaseTableStartColumn();
                    if ($this->tableLevel > 1) {
                        ++$column;
                    } else {
                        ++$row;
                    }
                    break;
                case 'thead':
                case 'tbody':
                    // Pure grouping elements: just descend
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;
                case 'tr':
                    // Each table row restarts at the start column of the current table
                    $column = $this->getTableStartColumn();
                    $cellContent = '';
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    ++$row;
                    break;
                case 'th':
                case 'td':
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                    // Skip cells already occupied by a rowspan merge from an earlier row
                    while (isset($this->rowspan[$column . $row])) {
                        ++$column;
                    }
                    $this->flushCell($sheet, $column, $row, $cellContent);

                    $hasRowspan = isset($attributeArray['rowspan']);
                    $hasColspan = isset($attributeArray['colspan']);
                    if ($hasRowspan || $hasColspan) {
                        // Attribute values are arbitrary strings from the document:
                        // force them to integers of at least 1 so the arithmetic
                        // below cannot fail or yield an inverted range
                        $rowCount = $hasRowspan ? max(1, (int) $attributeArray['rowspan']) : 1;
                        $columnCount = $hasColspan ? max(1, (int) $attributeArray['colspan']) : 1;

                        // Last column covered by the cell
                        $columnTo = $column;
                        for ($i = 1; $i < $columnCount; ++$i) {
                            ++$columnTo;
                        }

                        $range = $column . $row . ':' . $columnTo . ($row + $rowCount - 1);
                        if ($hasRowspan) {
                            // Remember every covered cell so later rows skip over it
                            foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                        }
                        $sheet->mergeCells($range);
                        $column = $columnTo;
                    }
                    ++$column;
                    break;
                case 'body':
                    // The body restarts the cursor at A1, outside any table
                    $row = 1;
                    $column = 'A';
                    $this->tableLevel = 0;
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    break;
                default:
                    // Unknown tags are transparent: process their children in place
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
            }
        }
    }
}
-
/**
 * Load an HTML file into the given workbook.
 *
 * The file is first sniffed with isValidFormat(). The worksheet at
 * $this->sheetIndex is created if necessary and made active, then the
 * document is parsed and walked with processDomElement().
 *
 * @param  string   $pFilename   Path of the HTML file to read
 * @param  PHPExcel $objPHPExcel Workbook receiving the data
 * @return PHPExcel The workbook passed in
 * @throws PHPExcel_Reader_Exception When the file does not look like HTML or cannot be parsed
 */
public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
{
    // The handle is only needed for format sniffing; release it on both outcomes
    $this->openFile($pFilename);
    $isHtml = $this->isValidFormat();
    fclose($this->fileHandle);
    if (!$isHtml) {
        throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
    }

    // Make sure the target sheet exists, then select it
    while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) {
        $objPHPExcel->createSheet();
    }
    $objPHPExcel->setActiveSheetIndex($this->sheetIndex);

    $dom = new DOMDocument();

    // securityScanFile() is not defined in this class; presumably inherited - confirm
    $html = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');

    // Collect parser diagnostics internally instead of letting loadHTML() raise a
    // PHP warning for each markup problem, which aborts the load when warnings
    // are converted to exceptions; the caller's setting is restored afterwards
    $previousSetting = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($html);
    if (!$previousSetting) {
        // Only discard the buffer when the caller was not collecting errors itself
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previousSetting);

    if ($loaded === false) {
        throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
    }

    $dom->preserveWhiteSpace = false;
    $row = 0;
    $column = 'A';
    $content = '';
    $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);

    return $objPHPExcel;
}
-
/**
 * Return the index of the worksheet this reader loads into.
 *
 * @return int
 */
public function getSheetIndex()
{
    $index = $this->sheetIndex;

    return $index;
}
-
/**
 * Choose the index of the worksheet this reader loads into.
 *
 * @param  int $pValue Sheet index (defaults to 0)
 * @return PHPExcel_Reader_HTML This reader, to allow method chaining
 */
public function setSheetIndex($pValue = 0)
{
    $reader = $this;
    $reader->sheetIndex = $pValue;

    return $reader;
}
-
/**
 * Reject content that declares an entity.
 *
 * The pattern looks for the characters of "<!ENTITY", each optionally
 * preceded or followed by a NUL byte.
 *
 * @param  string $xml Raw document content
 * @return string The unchanged content when no entity declaration was found
 * @throws PHPExcel_Reader_Exception When an entity declaration is detected
 */
public function securityScan($xml)
{
    $letters = str_split('<!ENTITY');
    $pattern = '/\\0?' . implode('\\0?', $letters) . '\\0?/';

    if (!preg_match($pattern, $xml)) {
        return $xml;
    }

    throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
}
- }
|