I adapted a piece of code I found on internet in order to read a huge Excel file without facing a "gc limited exceeded" as I always got in POI.
It would be a great idea for you, to browse a xlsx file first. The understanding of the sheets and xml elements and attributes names is important.
To do that:
After you can adapt the code to your own business objects depending of your organisation in cells.
Have fun! (But don't forget to work :D )
My test file was 42,723 rows long.
It would be a great idea for you, to browse a xlsx file first. The understanding of the sheets and xml elements and attributes names is important.
To do that:
- rename your .xlsx file in .zip file.
- Extract the content of your zip file in a folder.
- Open the xl\worksheets\sheets1.xml (Notepad++ is recommanded)
- Open the xl\sharedStrings.xml (Notepadd++ is recommanded)
After you can adapt the code to your own business objects depending of your organisation in cells.
Have fun! (But don't forget to work :D )
package BL.Parser;
import BO.Files.ExcelPublicationFile;
import BO.Publication.RisPublication;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.*;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Created by casteven on 19/03/14.
*/
public class ExcelPublicationFileParser
{
public ExcelPublicationFile ParseFile(ExcelPublicationFile file) throws Exception {
OPCPackage pkg = OPCPackage.open(file);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable(); //XML file containing all the String values, referenced by index
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
//Browsing sheets 1 and extracting data.
if (sheets.hasNext())
{
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
file.setPublications(SheetHandler.getPubs());
return file;
}
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
/**
* See org.xml.sax.helpers.DefaultHandler javadocs
*/
private static class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private RisPublication pub=new RisPublication();
private static List<RisPublication> pubs=new ArrayList<RisPublication>();
private int column=0;
private int row=0;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
pubs=new ArrayList<RisPublication>();
}
/*
GETTER AND SETTER
*/
public static List<RisPublication> getPubs() {
return pubs;
}
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException
{
// Clear contents cache
lastContents = "";
}
public void endElement(String uri, String localName, String name) throws SAXException {
// v => index of the content of a cell.
if(name.equals("v")) {
try {
int idx = Integer.parseInt(lastContents); //Catch the ID in int
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString(); // Get the value referenced by index ()
} catch (NumberFormatException e) {
}
}
//If we are reading a cell and columns is not the first
if(name.equals("c") && row>0) {
switch (column)
{
case 0:pub.setU3(lastContents);
break;
case 1:pub.setU4(lastContents);
break;
case 2:pub.setID(lastContents);
break;
case 3:pub.setT1(lastContents);
break;
case 4:String author=lastContents;
List<String> authors=Util.List_Util.convertCommaSeparatedStringToListOfString(author);
for (String s:authors)
pub.addA1(s);
break;
case 5:pub.setY1(lastContents);
break;
case 6:pub.setN2(lastContents);
case 7:String keyword=lastContents;
List<String> keywords=Util.List_Util.convertCommaSeparatedStringToListOfString(keyword);
for (String s:keywords)
pub.addKW(s);
break;
case 8:pub.setJF(lastContents);
break;
case 9:pub.setJA(lastContents);
break;
case 10:pub.setVL(lastContents);
break;
case 11:pub.setIS(lastContents);
break;
case 12:pub.setSP(lastContents);
break;
case 13:pub.setEP(lastContents);
break;
case 14:pub.setCY(lastContents);
break;
case 15:pub.setPB(lastContents);
break;
case 16:pub.setSN(lastContents);
break;
case 17:pub.setM1(lastContents);
break;
case 18:pub.setU1(lastContents);
break;
case 19:pub.setU2(lastContents);
break;
case 20:pub.setU5(lastContents);
break;
case 21:pub.setUR(lastContents);
}
column++;
}
//If it is the end of a row, save the current publication. An create a new one
if(name.equals("row")) {
if (row>0)
pubs.add(pub);
pub=new RisPublication();
row++;
column=0;
}
}
//Extracting the content of an element
public void characters(char[] ch, int start, int length) throws SAXException {
lastContents += new String(ch, start, length);
}
}
}
So where should we get these?
ReplyDeleteimport BO.Files.ExcelPublicationFile;
import BO.Publication.RisPublication;
is it your special package?
ReplyDeleteHi, these are not needed to read the Excel file. In my code I save the content of the file within an object called RisPublication.
DeleteThe ExcelPublicationFile package extends or implement the java files class if my memories are correct.