要读取大量数据,需要 ReadFilter,指定读取范围,下面是我的 ReadFilter 类
<?php namespace common\models; class MyExcelReadFilter implements \PhpOffice\PhpSpreadsheet\Reader\IReadFilter { private $startRowNo; // 如 1 private $endRowNo; // 如 1000 private $startColNo; // 如 1 private $endColNo; // 如 50 public function __construct($startRowNo= 1, $endRowNo = 1000, $startColNo = 1, $endColNo = 50) { $this->startRowNo = $startRowNo; $this->endRowNo = $endRowNo; $this->startColNo = $startColNo; $this->endColNo = $endColNo; } public function setRows($startRowNo, $rowCount) { $this->startRowNo = $startRowNo; $this->endRowNo = $startRowNo + $rowCount - 1; } public function setColumns($startColNo, $columnCount) { $this->startColNo = $startColNo; $this->endColNo = $startColNo + $columnCount - 1; } public function setColumnsByLetter($startColLetter, $endColLetter) { $this->startColNo = self::calcColNo($startColLetter); $this->endColNo = self::calcColNo($endColLetter); } /** * @inheritDoc */ public function readCell($column, $row, $worksheetName = '') { $colNo = self::calcColNo($column); if ($row >= $this->startRowNo && $row <= $this->endRowNo && $colNo >= $this->startColNo && $colNo <= $this->endColNo) { return true; } return false; } /** * @param $colLetter string Excel表格中的列号,例如 'A'、'Z'、'AA'、'AZ' * @return float|int 返回整数列号,从 1 开始,即 'A' 列 对应 1 列 */ public static function calcColNo(string $colLetter) { $len = strlen($colLetter); $colNo = 0; $weight = 1; for ($i = $len-1; $i >= 0; --$i) { $colNo += (ord($colLetter[$i]) - ord('A') + 1) * $weight; $weight *= 26; } return $colNo; } /** * @param int $colNo 整数列号,从 1 开始 * @return string 返回 Excel表格形式的列号,例如 'A'、'Z'、'AA'、'AZ',1列 对应 'A'列 */ public static function calcColLetter(int $colNo) { $letters = range('A', 'Z'); // $letters[0] = 'A', $letters[25] = 'Z' $colLetter = ''; $base = 26; do { --$colNo; // 改成基于下标 0 $r = $colNo % $base; $colLetter = $letters[$r] . $colLetter; $colNo = intval($colNo/$base); // 必须强制取整 } while ($colNo > 0); return $colLetter; } }
参考 (34条消息) 使用phpspreadsheet导出数据时内存溢出处理_x554462的博客-CSDN博客_phpspreadsheet内存溢出xhttps://blog.csdn.net/x554462/article/details/89102800写了 FileCache 类 (经过测试发现 FileCache 可以一定程度避免内存溢出,但读取速度比较慢)
<?php namespace common\models; use PhpOffice\PhpSpreadsheet\Reader\Exception; class MyExcelFileCache implements \Psr\SimpleCache\CacheInterface { const CACHE_LINES = 3000; private $cacheKeyDict = []; // 键字典 private $cache = []; // 一次性读取的数据装入该缓存,避免短期重复打开文件 private $fileHandleDict = []; // 文件指针字典 private $cacheDir; // 存放文件 cache 的目录 private $filePrefix; // 唯一性前缀保证文件名唯一性 /** * 递归删除缓存目录 * @param $path string */ private function delCacheDir(string $path) { if (is_dir($path)) { // 确保 $path 是目录 foreach (scandir($path) as $f) { // 扫描该目录下每个 目录或文件 if (is_dir($path . $f)) { // 是子目录 if ($f != '.' && $f != '..') { $this->delCacheDir($path . $f . '/'); // 递归删除子目录内部 @rmdir($path. $f . '/'); // 删除子目录本身 } } else { // 是文件 @unlink($path. $f); // 删除文件 } } } } /** * @param $key string * @return string 根据 key 生成文件名 */ private function getFilenameByKey($key) { $a = explode('.', $key); $end = array_pop($a); $dir = $this->cacheDir . implode('_', $a); if (!is_dir($dir)) { mkdir($dir, 0777, true); } $line = ''; $len = strlen($end); for ($i = 0; $i < $len; $i++) { if (is_numeric($end[$i])) $line.= $end[$i]; } $suf = intval(round($line / self::CACHE_LINES)); return $dir . '/' . $this->filePrefix . $suf; } /** * 根据键换算出文件名,然后到字典中查询文件指针 * @param $key string * @return mixed|resource 文件指针 * @throws Exception */ private function getFileHandleByKey(string $key) { $filename = $this->getFilenameByKey($key); if (!array_key_exists($filename, $this->fileHandleDict)) { // 字典中没有对应文件指针 $fp = fopen($filename, 'w+'); // 新建文件 if (!$fp) { throw new Exception('生成缓存文件失败(MyExcelFileCache)'); } $this->fileHandleDict[$filename] = $fp; // 向字典添加新建的指针 } return $this->fileHandleDict[$filename]; // 返回字典中的指针 } private function removeKeyPrefix($key) { return preg_replace('/^phpspreadsheet\./', '', $key); // 移除前缀 phpspreadsheet. } public function __construct($cacheDir) { $this->cacheDir = rtrim($cacheDir, '/') . '/'; $this->filePrefix = uniqid(); } public function __destruct() { $this->clear(); } /** * @inheritDoc */ public function get($key, $default = null) { $key = $this->removeKeyPrefix($key); if ($this->has($key)) { // 文件缓存的键字典中有该键 (即缓存过的) $seek = $this->cacheKeyDict[$key]; // 取出在文件缓存中的定位信息 if (array_key_exists($key, $this->cache) && $this->cache[$key]['seek'] == $seek) { // cache数组中是否已经提取过 return $this->cache[$key]['data']; // 已经提取过,直接返回 } $fp = $this->getFileHandleByKey($key); // 到文件指针字典中查询出文件指针 $this->cache = []; fseek($fp, 0); // 定位到文件开头 while (!feof($fp)) { $data = fgets($fp); // 读取一行 $data = json_decode(trim($data), 1); // 解码成关联数组 if ($data['key'] == $key && $data['seek'] == $seek) { // 该行对应这次查询的键,保存到 $default $default = unserialize($data['data']); } $this->cache[$data['key']] = [ // 读取过程中,把所有行都缓存到 cache数组,避免短期反复打开该文件 'data' => unserialize($data['data']), 'seek' => $data['seek'] ]; } } return $default; } /** * @inheritDoc */ public function set($key, $value, $ttl = null) { $key = $this->removeKeyPrefix($key); if ($this->has($key) && $this->get($key) == $value) return true; // 之前已经缓存过 $fp = $this->getFileHandleByKey($key); // 取出文件指针 (内容从未缓存或内容已经变更) fseek($fp, 0, SEEK_END); // 定位到文件尾部 $seek = ftell($fp); // 获得文件指针位置 $this->cacheKeyDict[$key] = $seek; // 在字典登记该键 (键对应键,值为数据在文件缓存中的起始位置) fwrite($fp, json_encode([ 'key' => $key, 'data' => serialize($value), 'seek' => $seek ]).PHP_EOL); // 在文件缓存写入数据 (json编码的关联数组) unset($value); // “释放”已经写入缓存的变量 return true; } /** * @inheritDoc */ public function delete($key) { $key = $this->removeKeyPrefix($key); unset($this->cacheKeyDict[$key]); // 从键字典释放该键 return true; } /** * @inheritDoc */ public function clear() { $this->cacheKeyDict = []; // 键字典清空 foreach ($this->fileHandleDict as $fileHandle) { // 文件指针字典中所有文件关闭 isset($fileHandle) && fclose($fileHandle); } $this->delCacheDir($this->cacheDir); // 递归删除缓存目录 return true; } /** * @inheritDoc */ public function getMultiple($keys, $default = null) { $result = []; foreach ($keys as $key) $result[$key] = $this->get($key, $default); return $result; } /** * @inheritDoc */ public function setMultiple($values, $ttl = null) { foreach ($values as $key => $value) { $this->set($key, $value); } return true; } /** * @inheritDoc */ public function deleteMultiple($keys) { foreach ($keys as $key) $this->delete($key); return true; } /** * @inheritDoc */ public function has($key) { $key = $this->removeKeyPrefix($key); return array_key_exists($key, $this->cacheKeyDict); // 缓存键字典中是否有该键 } }
然后是自己的 3种 读取方式的类(直接读取、trunk方式循环读取、带FileCache读取)。这个类不完全通用,因为是针对特定问题处理的。
<?php namespace common\models; use PhpOffice\PhpSpreadsheet\IOFactory; use PhpOffice\PhpSpreadsheet\Reader\Xlsx; use PhpOffice\PhpSpreadsheet\Settings; class MyExcelUtility { const CHUNK_SIZE = 1000; public static function excelDateToDate(int $nDate, $format = 'Y-m-d') { return date($format, ($nDate-25569) * 86400); // Excel bug, 1900 年是闰年 } public static function excelDateToUnixStamp(int $nDate) { return ($nDate-25569) * 86400; } /** * 获得第一个 sheet 总的行数和列数 * @param $reader Xlsx IOFactory 创建的实例(根据后缀名创建不同类的实例) * @param $filePath string * @return array ['totalRows' => .., 'totalColumns' => ...] */ public static function getSheetInfo($reader, $filePath) { $info = $reader->listWorkSheetInfo($filePath); return ['totalRows' => intval($info[0]['totalRows']), 'totalColumns' => intval($info[0]['totalColumns'])]; } public static function read($filePath, $maxRowNo, $maxColNo, $inputFileType = 'Xlsx', $calcColNos = [3, 4]) { $reader = IOFactory::createReader($inputFileType); // 创建指定类型的 reader $reader->setReadDataOnly(true); // 非只读似乎有问题 Xlsx.php 884行 Illegal offset type 警告错误 $filterSubnet = new MyExcelReadFilter(1, $maxRowNo, 1, $maxColNo); $reader->setReadFilter($filterSubnet); // 设定读取范围 $spreadsheet = $reader->load($filePath); $result = []; for ($rowNo = 1; $rowNo <= $maxRowNo; $rowNo++) { for ($colNo = 1; $colNo <= $maxColNo; $colNo++) { if (in_array($colNo, $calcColNos)) $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getCalculatedValue(); // 都当作计算单元格读取可能比较慢,所以这里对这些列单独处理 else $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getValue(); } } $spreadsheet->__destruct(); $spreadsheet = null; unset($spreadsheet); // 及时释放 $spreadsheet 对象有助于防止内存溢出 $reader = null; unset($reader); return $result; } public static function readWithTrunk($filePath, $maxRowNo, $maxColNo, $inputFileType = 'Xlsx', $calcColNos = [3, 4], $chunkSize = self::CHUNK_SIZE) { $result = []; for ($startRowNo = 1; $startRowNo <= $maxColNo; $startRowNo += $chunkSize) { $reader = IOFactory::createReader($inputFileType); $reader->setReadDataOnly(true); $filterSubnet = new MyExcelReadFilter(); $reader->setReadFilter($filterSubnet); $filterSubnet->setColumnsByLetter('A', 'AZ'); $filterSubnet->setRows($startRowNo, $chunkSize); $spreadsheet = $reader->load($filePath); for ($rowNo = 1; $rowNo <= $maxRowNo; $rowNo++) { for ($colNo = 1; $colNo <= $maxColNo; $colNo++) { if (in_array($colNo, $calcColNos)) $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getCalculatedValue(); // 都当作计算单元格读取可能比较慢,所以这里对这些列单独处理 else $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getValue(); } } $spreadsheet->__destruct(); $spreadsheet = null; unset($spreadsheet); // 及时释放 $spreadsheet 对象有助于防止内存溢出 $reader = null; unset($reader); } return $result; } public static function readWithFileCache($filePath, $maxRowNo, $maxColNo, $inputFileType = 'Xlsx', $calcColNos = [3, 4], $cacheDir = '/tmp/excel_cache') { Settings::setCache(new MyExcelFileCache($cacheDir)); $reader = IOFactory::createReader($inputFileType); // 创建指定类型的 reader $reader->setReadDataOnly(true); $filterSubnet = new MyExcelReadFilter(1, $maxRowNo, 1, $maxColNo); $reader->setReadFilter($filterSubnet); // 设定读取范围 $spreadsheet = $reader->load($filePath); $result = []; for ($rowNo = 1; $rowNo <= $maxRowNo; $rowNo++) { for ($colNo = 1; $colNo <= $maxColNo; $colNo++) { if (in_array($colNo, $calcColNos)) $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getCalculatedValue(); // 都当作计算单元格读取可能比较慢,所以这里对这些列单独处理 else $result[$rowNo][$colNo] = $spreadsheet->getSheet(0) ->getCell(MyExcelReadFilter::calcColLetter($colNo) . $rowNo) ->getValue(); } } // $spreadsheet->__destruct(); // 不注释这句会造成 能读取完,但出现段错误 // $spreadsheet = null; // unset($spreadsheet); // $reader = null; // unset($reader); return $result; } public static function getHeaderRanger($result, $maxHeaderRows, $maxColNo) // 表头不超过 $maxHeaderRows 行 $maxColNo 列 { $lastColNo = $maxColNo; $longestRowNo = -1; while ($lastColNo >= 1) { for ($rowNo = 1; $rowNo <= $maxHeaderRows; $rowNo++) { if (!empty($result[$rowNo][$lastColNo])) { $longestRowNo = $rowNo; break; } } if ($longestRowNo > 0) break; --$lastColNo; } return ['longestRowNo' => $longestRowNo, 'lastColNo' => $lastColNo]; } }
测试代码
public function actionXlsx1() { $xlsxFile = User::getFilePath('test1.xlsx', '', true, '@backend/runtime'); $start = microtime(true); $result = MyExcelUtility::read($xlsxFile, 500, 50); $end = microtime(true); for ($i = 1; $i <= 5; $i++) { for ($j = 1; $j <= 20; $j++) { echo $result[$i][$j]."\t"; } echo PHP_EOL; } echo number_format($end - $start, 10). " sec for read\n"; // 1000段错误,无法得到结果 500 -> 29.4952 } public function actionXlsx2() { $xlsxFile = User::getFilePath('test1.xlsx', '', true, '@backend/runtime'); $start = microtime(true); $result = MyExcelUtility::readWithTrunk($xlsxFile, 1000, 50, 'Xlsx', [3,4], 400); $end = microtime(true); for ($i = 1; $i <= 5; $i++) { for ($j = 1; $j <= 20; $j++) { echo $result[$i][$j]."\t"; } echo PHP_EOL; } echo number_format($end - $start, 10). " sec for read with trunk\n"; // 1000/400->40.4761 1000/240->39.2139 1000/500->39.5506 } public function actionXlsx3() { $xlsxFile = User::getFilePath('test1.xlsx', '', true, '@backend/runtime'); $start = microtime(true); $result = MyExcelUtility::readWithFileCache($xlsxFile, 1000, 50); $end = microtime(true); for ($i = 1; $i <= 5; $i++) { for ($j = 1; $j <= 20; $j++) { echo $result[$i][$j]."\t"; } echo PHP_EOL; } echo number_format($end - $start, 10). " sec for read with file cache\n"; // 51.2165 }
总结:
在3个readXxx函数中,如果没有 $reader->setReadDataOnly(true) ,即设置为只读,在 Xlsx.php 文件的 884行 会出现 Illegal offset type 信息的php警告错误
仔细核对了github上的代码,发现已经fix了,哈哈,就是 $comment['authorId'] 是一个对象,作为数组的键会出这个错误,github上已经前面加了 (int)强制类型转换了
对于带FileCache的读取函数,读取完成后不能使用 $spreadsheet->__destruct(); 去释放内存,如果这么做,会出现段错误 (内存越界之类)。FileCache实例对象是被动被reader使用的,即读取前设置而已。
直接读取方式还是比较容易出现段错误的,我的例子中,虚拟机ubuntu18.04 内存8GB php7.2 控制台命令方式 50列 1000行就出现段错误,500行不会。500行耗时29.4952秒。
Trunk方式读取,也就是循环读取释放的方式,参考Reading large File increases Execution time & Memory (E.g file with 500000 records) · Issue #629 · PHPOffice/PhpSpreadsheet · GitHubhttps://github.com/PHPOffice/PhpSpreadsheet/issues/629
,不会出现段错误,而且在3种里面似乎效率最高。测试中,1000行分400行一次,耗时40.4761秒,分240行一次,39.2139行,分500行一次,39.5506行,基本上无差别,这似乎表明,时间并不是耗费在重复打开解压同一个文件上(Xlsx等带x的Office文件都是zip文件),而是耗费在解析这些单元格上。(10000行,400行一次,耗时473.7826秒)
带FileCache读取,1000行耗时51.2165秒(10000行耗时598.8875秒),没有速度优势,不过可以避免内存耗尽。另外,如果需要纵向的大量数据的公式计算,也只能用这种方式。