You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
@jbaiter I installed the new 0.6.0 release and am running some checks with ALTO. I generate the ALTO from PDF with pdfalto. For a blank PDF page I get an empty ALTO document like this:
That is, it contains no TextBlock/TextLine/String elements.
Indexing it in Solr gives this error:
2021-05-13 09:19:21.360 ERROR (qtp1997859171-641) [ x:archipelago] o.a.s.h.RequestHandlerBase org.apache.solr.common.SolrException: Exception writing document id g0qa5y-default_solr_index-strawberryfield_flavor_datasource/151:6:en:cc9b80b1-bd9a-4657-94df-4d62785e08c5:ocr to the index; possible analysis error.
at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:256)
at org.apache.solr.update.processor.RunUpdateProcessorFactory$RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:73)
at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:55)
at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:259)
at org.apache.solr.update.processor.DistributedUpdateProcessor.doVersionAdd(DistributedUpdateProcessor.java:498)
at org.apache.solr.update.processor.DistributedUpdateProcessor.lambda$versionAdd$0(DistributedUpdateProcessor.java:339)
at org.apache.solr.update.VersionBucket.runWithLock(VersionBucket.java:50)
at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:339)
at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:225)
at org.apache.solr.update.processor.LogUpdateProcessorFactory$LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:106)
at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:263)
at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:190)
at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:97)
at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:68)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:214)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2627)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:795)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:568)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:415)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1596)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:545)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:590)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1300)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:485)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1215)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
at org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177)
at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:500)
at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:547)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)
at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Current event not START_ELEMENT
at com.ctc.wstx.sr.BasicStreamReader.getAttributeValue(BasicStreamReader.java:632)
at de.digitalcollections.solrocr.formats.alto.AltoParser.readNext(AltoParser.java:59)
at de.digitalcollections.solrocr.formats.OcrParser.<init>(OcrParser.java:106)
at de.digitalcollections.solrocr.formats.alto.AltoParser.<init>(AltoParser.java:27)
at de.digitalcollections.solrocr.formats.alto.AltoFormat.getParser(AltoFormat.java:38)
at de.digitalcollections.solrocr.model.OcrFormat.filter(OcrFormat.java:90)
at de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory.create(OcrCharFilterFactory.java:51)
at org.apache.solr.analysis.TokenizerChain.initReader(TokenizerChain.java:97)
at org.apache.lucene.analysis.AnalyzerWrapper.initReader(AnalyzerWrapper.java:156)
at org.apache.lucene.analysis.AnalyzerWrapper.initReader(AnalyzerWrapper.java:156)
at org.apache.lucene.analysis.Analyzer.tokenStream(Analyzer.java:197)
at org.apache.lucene.document.Field.tokenStream(Field.java:513)
at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:915)
at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:524)
at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:488)
at org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments(DocumentsWriterPerThread.java:208)
at org.apache.lucene.index.DocumentsWriter.updateDocuments(DocumentsWriter.java:419)
at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1471)
at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757)
at org.apache.solr.update.DirectUpdateHandler2.updateDocOrDocValues(DirectUpdateHandler2.java:983)
at org.apache.solr.update.DirectUpdateHandler2.doNormalUpdate(DirectUpdateHandler2.java:347)
at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:294)
at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:241)
... 56 more
Is this by design, meaning that I have to add at least one empty (or whitespace-only) String element?
Thanks for everything.
The text was updated successfully, but these errors were encountered:
giancarlobi
changed the title
Solr return error with empty ALTO, by design ?
Solr returns error with empty ALTO, by design ?
May 13, 2021
@jbaiter I installed the new 0.6.0 release and am running some checks with ALTO. I generate the ALTO from PDF with pdfalto. For a blank PDF page I get an empty ALTO document like this:
That is, it contains no TextBlock/TextLine/String elements.
Indexing it in Solr gives this error:
Is this by design, meaning that I have to add at least one empty (or whitespace-only) String element?
Thanks for everything.
The text was updated successfully, but these errors were encountered: