spreadsheetreader_csv.php (6712B)
<?php
/**
 * Class for parsing CSV files
 *
 * Exposes the rows of a delimited text file through the Iterator interface.
 * The text encoding is taken from the byte-order mark at the start of the file
 * (UTF-8 is assumed when there is none); fields of UTF-16 and UTF-32 files are
 * converted to UTF-8 before they are returned.
 *
 * NOTE(review): rows are split by fgetcsv() on the raw bytes of the file, so
 * for UTF-16/UTF-32 input enclosure handling (quoted delimiters, embedded line
 * breaks) is not reliable; only the enclosure characters around a field are
 * trimmed off after conversion.
 *
 * @author Martins Pilsetnieks
 */
class SpreadsheetReader_CSV implements Iterator, Countable
{
    /**
     * @var array Options array, pre-populated with the default values.
     */
    private $Options = array(
        'Delimiter' => ';',
        'Enclosure' => '"'
    );

    /**
     * @var string Encoding of the file as determined from its byte-order mark
     */
    private $Encoding = 'UTF-8';

    /**
     * @var int Length of the byte-order mark in bytes, 0 if the file has none
     */
    private $BOMLength = 0;

    /**
     * @var resource File handle
     */
    private $Handle = false;

    /**
     * @var string Path the reader was created with
     */
    private $Filepath = '';

    /**
     * @var int Zero-based position of the current row
     */
    private $Index = 0;

    /**
     * @var mixed Current row: null before the first read, array of fields
     *    after a successful read, false when nothing more could be read
     */
    private $CurrentRow = null;

    /**
     * @param string Path to file
     * @param array Options:
     *    Enclosure => string CSV enclosure
     *    Delimiter => string CSV delimiter; an empty value means that it
     *        is guessed from the first row (semicolon, tab or comma)
     *
     * @throws Exception If the file is not readable or cannot be opened
     */
    public function __construct($Filepath, array $Options = null)
    {
        $this -> Filepath = $Filepath;

        if (!is_readable($Filepath))
        {
            throw new Exception('SpreadsheetReader_CSV: File not readable ('.$Filepath.')');
        }

        // For safety's sake
        @ini_set('auto_detect_line_endings', true);

        // array_merge() does not accept null, so the defaults are only
        // overridden when options were actually supplied.
        if ($Options)
        {
            $this -> Options = array_merge($this -> Options, $Options);
        }

        $this -> Handle = fopen($Filepath, 'r');
        if (!$this -> Handle)
        {
            throw new Exception('SpreadsheetReader_CSV: File could not be opened ('.$Filepath.')');
        }

        // Checking the file for byte-order mark to determine encoding
        $this -> DetectEncoding();

        // Seeking the place right after BOM as the start of the real content.
        // This is done even when there is no BOM because the detection above
        // has already consumed the first bytes of the file.
        fseek($this -> Handle, $this -> BOMLength);

        // Checking for the delimiter if it should be determined automatically
        if (!$this -> Options['Delimiter'])
        {
            $this -> Options['Delimiter'] = $this -> DetectDelimiter();
        }
    }

    /**
     * Closes the file handle when the reader is discarded.
     */
    public function __destruct()
    {
        if (is_resource($this -> Handle))
        {
            fclose($this -> Handle);
        }
        $this -> Handle = false;
    }

    /**
     * Determines the encoding and the BOM length from the first bytes of the file.
     * The defaults (UTF-8, no BOM) are left in place when no known mark is found.
     */
    private function DetectEncoding()
    {
        fseek($this -> Handle, 0);
        $Head = bin2hex((string)fread($this -> Handle, 4));

        // The four-byte marks are checked first: the UTF-32LE mark (FF FE 00 00)
        // starts with the UTF-16LE mark (FF FE) and would otherwise never be reached.
        $Signatures = array(
            array('0000feff', 'UTF-32BE'),
            array('fffe0000', 'UTF-32LE'),
            array('efbbbf', 'UTF-8'),
            array('fffe', 'UTF-16LE'),
            array('feff', 'UTF-16BE')
        );

        foreach ($Signatures as $Signature)
        {
            list($Hex, $Encoding) = $Signature;
            if (strncmp($Head, $Hex, strlen($Hex)) === 0)
            {
                $this -> Encoding = $Encoding;
                // Two hex digits per byte
                $this -> BOMLength = (int)(strlen($Hex) / 2);
                return;
            }
        }
    }

    /**
     * Guesses the delimiter by reading the first row with each candidate and
     * checking if a specific separator character produces more columns than
     * the others (it means that most likely that is the delimiter).
     * The semicolon wins when no candidate produces more columns than it does.
     *
     * @return string Delimiter character
     */
    private function DetectDelimiter()
    {
        // fgetcsv needs single-byte separators
        $Semicolon = ';';
        $Tab = "\t";
        $Comma = ',';

        $SemicolonCount = $this -> CountColumns($Semicolon);
        $TabCount = $this -> CountColumns($Tab);
        $CommaCount = $this -> CountColumns($Comma);

        $Delimiter = $Semicolon;
        if ($TabCount > $SemicolonCount || $CommaCount > $SemicolonCount)
        {
            $Delimiter = $CommaCount > $TabCount ? $Comma : $Tab;
        }

        return $Delimiter;
    }

    /**
     * Counts the columns of the first row when it is split by the given delimiter.
     * The handle is left at the start of the content.
     *
     * @param string Delimiter character
     *
     * @return int Number of columns, 0 if no row could be read (e.g., empty file)
     */
    private function CountColumns($Delimiter)
    {
        fseek($this -> Handle, $this -> BOMLength);
        $Row = $this -> ReadRow($Delimiter);
        fseek($this -> Handle, $this -> BOMLength);

        return is_array($Row) ? count($Row) : 0;
    }

    /**
     * Reads one raw row from the current position of the handle.
     *
     * @param string Delimiter character
     *
     * @return mixed Array of raw fields or false if nothing could be read
     */
    private function ReadRow($Delimiter)
    {
        // The escape character is the long-standing default of fgetcsv; it is
        // spelled out because newer PHP versions complain when it is omitted.
        return fgetcsv($this -> Handle, null, $Delimiter, $this -> Options['Enclosure'], '\\');
    }

    /**
     * @return int Bytes per code unit for the multi-byte encodings that need
     *    realignment (2 for UTF-16, 4 for UTF-32), 0 for everything else
     */
    private function CodeUnitWidth()
    {
        switch ($this -> Encoding)
        {
            case 'UTF-16LE':
            case 'UTF-16BE':
                return 2;
            case 'UTF-32LE':
            case 'UTF-32BE':
                return 4;
        }

        return 0;
    }

    /**
     * Moves the handle to the start of the code unit that contains the next
     * significant byte.
     *
     * Line breaks in UTF-16/UTF-32 files are padded with zero bytes
     * (e.g., 0x0D 0x00 0x0A 0x00) and the previous row may have been split on
     * either the first or the second line break, so zero bytes and line break
     * bytes are skipped. The position is then rounded down to a code unit
     * boundary (counted from the end of the BOM) so that the row starts on a
     * whole character regardless of which byte of it was the significant one.
     *
     * @param int Bytes per code unit
     */
    private function SkipToNextCodeUnit($UnitWidth)
    {
        while (($Byte = fgetc($this -> Handle)) !== false)
        {
            $Code = ord($Byte);

            // While bytes are insignificant whitespace, do nothing
            if ($Code === 0 || $Code === 10 || $Code === 13)
            {
                continue;
            }

            $Position = ftell($this -> Handle) - 1;
            $Misalignment = ($Position - $this -> BOMLength) % $UnitWidth;
            fseek($this -> Handle, $Position - $Misalignment);
            return;
        }
    }

    /**
     * Converts a raw field of a multi-byte encoded file to UTF-8 and trims
     * whitespace and enclosure symbols off of it because those aren't
     * recognized in the relevant encodings.
     *
     * The row is split on the single byte of the delimiter and the line break,
     * which leaves the zero padding of that code unit attached to a neighbouring
     * field: at the end of the field for big-endian data and at the start of the
     * following field for little-endian data. Those stray bytes are cut off first,
     * otherwise every character after them would be decoded one byte off.
     *
     * @param mixed Raw field as returned by fgetcsv
     * @param int Bytes per code unit, 0 if no realignment is needed
     *
     * @return string Field value in UTF-8
     */
    private function DecodeField($Value, $UnitWidth)
    {
        $Value = (string)$Value;

        $Stray = $UnitWidth ? strlen($Value) % $UnitWidth : 0;
        if ($Stray)
        {
            $LittleEndian = ($this -> Encoding === 'UTF-16LE' || $this -> Encoding === 'UTF-32LE');
            $Value = (string)($LittleEndian ? substr($Value, $Stray) : substr($Value, 0, -$Stray));
        }

        return trim(trim(
            mb_convert_encoding($Value, 'UTF-8', $this -> Encoding),
            $this -> Options['Enclosure']
        ));
    }

    /**
     * Returns information about sheets in the file.
     * Because CSV doesn't have any, it's just a single entry.
     *
     * @return array Sheet data
     */
    public function Sheets()
    {
        return array(0 => basename($this -> Filepath));
    }

    /**
     * Changes sheet to another. Because CSV doesn't have any sheets
     * it just rewinds the file so the behaviour is compatible with other
     * sheet readers. (If an invalid index is given, it doesn't do anything.)
     *
     * @param int Sheet index
     *
     * @return bool Status
     */
    public function ChangeSheet($Index)
    {
        if ($Index == 0)
        {
            $this -> rewind();
            return true;
        }
        return false;
    }

    // !Iterator interface methods
    /**
     * Rewind the Iterator to the first element.
     * Similar to the reset() function for arrays in PHP
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        fseek($this -> Handle, $this -> BOMLength);
        $this -> CurrentRow = null;
        $this -> Index = 0;
    }

    /**
     * Return the current element.
     * Similar to the current() function for arrays in PHP
     *
     * @return mixed current element from the collection
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        // Nothing has been read yet, the first row is loaded on demand
        // while the index is kept at the first position.
        if ($this -> Index == 0 && is_null($this -> CurrentRow))
        {
            $this -> next();
            $this -> Index--;
        }
        return $this -> CurrentRow;
    }

    /**
     * Move forward to next element.
     * Similar to the next() function for arrays in PHP
     *
     * @return mixed Array of fields of the new current row or false if
     *    there are no more rows
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this -> CurrentRow = array();

        // Finding the place the next line starts for multi-byte encoded files
        $UnitWidth = $this -> CodeUnitWidth();
        if ($UnitWidth)
        {
            $this -> SkipToNextCodeUnit($UnitWidth);
        }

        $this -> Index++;
        $this -> CurrentRow = $this -> ReadRow($this -> Options['Delimiter']);

        // Converting multi-byte unicode strings
        if ($this -> CurrentRow && $this -> Encoding != 'ASCII' && $this -> Encoding != 'UTF-8')
        {
            foreach ($this -> CurrentRow as $Key => $Value)
            {
                $this -> CurrentRow[$Key] = $this -> DecodeField($Value, $UnitWidth);
            }
        }

        return $this -> CurrentRow;
    }

    /**
     * Return the identifying key of the current element.
     * Similar to the key() function for arrays in PHP
     *
     * @return mixed either an integer or a string
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this -> Index;
    }

    /**
     * Check if there is a current element after calls to rewind() or next().
     * Used to check if we've iterated to the end of the collection
     *
     * @return boolean FALSE if there's nothing more to iterate over
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        // The first row is loaded on demand so that a file without any rows is
        // reported as finished right away instead of yielding a single false row.
        if ($this -> Index == 0 && is_null($this -> CurrentRow))
        {
            $this -> current();
        }

        return ($this -> CurrentRow || !feof($this -> Handle));
    }

    // !Countable interface method
    /**
     * Ostensibly should return the count of the contained items but this just returns the number
     * of rows read so far. It's not really correct but at least coherent.
     *
     * @return int Index of the current row plus one
     */
    #[\ReturnTypeWillChange]
    public function count()
    {
        return $this -> Index + 1;
    }
}

?>