-
Notifications
You must be signed in to change notification settings - Fork 781
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TIKA-3459 -- integrate Drew Noakes metadata-extractor as the underlyi…
…ng MP4 parser.
- Loading branch information
Showing
6 changed files
with
681 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
302 changes: 302 additions & 0 deletions
302
tika-parsers/src/main/java/org/apache/tika/parser/mp4/NoakesMP4Parser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,302 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.tika.parser.mp4; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.nio.file.Files; | ||
import java.text.DecimalFormat; | ||
import java.text.NumberFormat; | ||
import java.util.Arrays; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.Date; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
|
||
import com.drew.imaging.mp4.Mp4Reader; | ||
import com.drew.metadata.Directory; | ||
import com.drew.metadata.MetadataException; | ||
import com.drew.metadata.mp4.Mp4BoxHandler; | ||
import com.drew.metadata.mp4.Mp4Directory; | ||
import com.drew.metadata.mp4.media.Mp4SoundDirectory; | ||
import com.drew.metadata.mp4.media.Mp4VideoDirectory; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.xml.sax.ContentHandler; | ||
import org.xml.sax.SAXException; | ||
|
||
import org.apache.tika.exception.RuntimeSAXException; | ||
import org.apache.tika.exception.TikaException; | ||
import org.apache.tika.io.TemporaryResources; | ||
import org.apache.tika.io.TikaInputStream; | ||
import org.apache.tika.metadata.Metadata; | ||
import org.apache.tika.metadata.Property; | ||
import org.apache.tika.metadata.TikaCoreProperties; | ||
import org.apache.tika.metadata.XMPDM; | ||
import org.apache.tika.mime.MediaType; | ||
import org.apache.tika.parser.AbstractParser; | ||
import org.apache.tika.parser.ParseContext; | ||
import org.apache.tika.sax.XHTMLContentHandler; | ||
|
||
/** | ||
* Parser for the MP4 media container format, as well as the older | ||
* QuickTime format that MP4 is based on. | ||
* <p> | ||
* This uses Drew Noakes' metadata-extractor: https://github.com/drewnoakes/metadata-extractor | ||
*/ | ||
public class NoakesMP4Parser extends AbstractParser { | ||
/** | ||
* Serial version UID | ||
*/ | ||
private static final long serialVersionUID = 84011216792285L; | ||
private static final Map<MediaType, List<String>> typesMap = new HashMap<>(); | ||
private static final Set<MediaType> SUPPORTED_TYPES = | ||
Collections.unmodifiableSet(typesMap.keySet()); | ||
|
||
private static final MediaType APPLICATION_MP4 = MediaType.application("mp4"); | ||
private static final int MAX_ERROR_MESSAGES = 100; | ||
static { | ||
// All types should be 4 bytes long, space padded as needed | ||
typesMap.put(MediaType.audio("mp4"), Arrays.asList("M4A ", "M4B ", "F4A ", "F4B ")); | ||
typesMap.put(MediaType.video("3gpp"), | ||
Arrays.asList("3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", | ||
"3gp6", "3gs7")); | ||
typesMap.put(MediaType.video("3gpp2"), Arrays.asList("3g2a", "3g2b", "3g2c")); | ||
typesMap.put(MediaType.video("mp4"), Arrays.asList("mp41", "mp42")); | ||
typesMap.put(MediaType.video("x-m4v"), Arrays.asList("M4V ", "M4VH", "M4VP")); | ||
|
||
typesMap.put(MediaType.video("quicktime"), Collections.emptyList()); | ||
typesMap.put(MediaType.application("mp4"), Collections.emptyList()); | ||
} | ||
|
||
private ISO6709Extractor iso6709Extractor = new ISO6709Extractor(); | ||
|
||
public Set<MediaType> getSupportedTypes(ParseContext context) { | ||
return SUPPORTED_TYPES; | ||
} | ||
|
||
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, | ||
ParseContext context) throws IOException, SAXException, TikaException { | ||
|
||
TemporaryResources tmp = new TemporaryResources(); | ||
TikaInputStream tstream = TikaInputStream.get(stream, tmp); | ||
|
||
try (InputStream is = Files.newInputStream(tstream.getPath())) { | ||
|
||
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); | ||
xhtml.startDocument(); | ||
com.drew.metadata.Metadata mp4Metadata = new com.drew.metadata.Metadata(); | ||
Mp4BoxHandler boxHandler = new TikaMp4BoxHandler(mp4Metadata, metadata, xhtml); | ||
try { | ||
Mp4Reader.extract(is, boxHandler); | ||
} catch (RuntimeSAXException e) { | ||
throw (SAXException) e.getCause(); | ||
} | ||
//TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader | ||
//currently swallows IOExceptions. | ||
Set<String> errorMessages = | ||
processMp4Directories( | ||
mp4Metadata.getDirectoriesOfType(Mp4Directory.class), | ||
metadata); | ||
|
||
for (String m : errorMessages) { | ||
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, m); | ||
} | ||
xhtml.endDocument(); | ||
} finally { | ||
tmp.dispose(); | ||
} | ||
} | ||
|
||
private Set<String> processMp4Directories(Collection<Mp4Directory> mp4Directories, | ||
Metadata metadata) { | ||
Set<String> errorMsgs = new HashSet<>(); | ||
for (Mp4Directory mp4Directory : mp4Directories) { | ||
for (String m : mp4Directory.getErrors()) { | ||
if (errorMsgs.size() < MAX_ERROR_MESSAGES) { | ||
errorMsgs.add(m); | ||
} else { | ||
break; | ||
} | ||
} | ||
/* for (Tag t : mp4Directory.getTags()) { | ||
System.out.println(mp4Directory.getClass() + " : " + t.getTagName() | ||
+ " : " + mp4Directory.getString(t.getTagType())); | ||
}*/ | ||
if (mp4Directory instanceof Mp4SoundDirectory) { | ||
processMp4SoundDirectory((Mp4SoundDirectory) mp4Directory, metadata); | ||
} else if (mp4Directory instanceof Mp4VideoDirectory) { | ||
processMp4VideoDirectory((Mp4VideoDirectory) mp4Directory, metadata); | ||
} else { | ||
processActualMp4Directory(mp4Directory, metadata); | ||
} | ||
} | ||
return errorMsgs; | ||
} | ||
|
||
private void processMp4VideoDirectory(Mp4VideoDirectory mp4Directory, Metadata metadata) { | ||
addInt(mp4Directory, metadata, Mp4VideoDirectory.TAG_HEIGHT, Metadata.IMAGE_LENGTH); | ||
addInt(mp4Directory, metadata, Mp4VideoDirectory.TAG_WIDTH, Metadata.IMAGE_WIDTH); | ||
if (mp4Directory.containsTag(Mp4VideoDirectory.TAG_COMPRESSOR_NAME)) { | ||
String compressor = mp4Directory.getString(Mp4VideoDirectory.TAG_COMPRESSOR_NAME); | ||
metadata.set(XMPDM.VIDEO_COMPRESSOR, compressor); | ||
} | ||
} | ||
|
||
private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory, | ||
Metadata metadata) { | ||
addInt(mp4SoundDirectory, metadata, Mp4SoundDirectory.TAG_AUDIO_SAMPLE_RATE, | ||
XMPDM.AUDIO_SAMPLE_RATE); | ||
|
||
try { | ||
int numChannels = mp4SoundDirectory.getInt(Mp4SoundDirectory.TAG_NUMBER_OF_CHANNELS); | ||
|
||
if (numChannels == 1) { | ||
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono"); | ||
} else if (numChannels == 2) { | ||
metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo"); | ||
} else { | ||
//??? log | ||
} | ||
|
||
} catch (MetadataException e) { | ||
//log | ||
} | ||
} | ||
|
||
private void addInt(Mp4Directory mp4Directory, Metadata metadata, int tag, | ||
Property property) { | ||
try { | ||
int val = mp4Directory.getInt(tag); | ||
metadata.set(property, val); | ||
} catch (MetadataException e) { | ||
//log | ||
} | ||
} | ||
|
||
private void processActualMp4Directory(Mp4Directory mp4Directory, Metadata metadata) { | ||
addDate(mp4Directory, metadata, Mp4Directory.TAG_CREATION_TIME, TikaCoreProperties.CREATED); | ||
addDate(mp4Directory, metadata, Mp4Directory.TAG_MODIFICATION_TIME, | ||
TikaCoreProperties.MODIFIED); | ||
handleBrands(mp4Directory, metadata); | ||
handleDurationInSeconds(mp4Directory, metadata); | ||
|
||
addDouble(mp4Directory, metadata, Mp4Directory.TAG_LATITUDE, TikaCoreProperties.LATITUDE); | ||
addDouble(mp4Directory, metadata, Mp4Directory.TAG_LONGITUDE, TikaCoreProperties.LONGITUDE); | ||
addInt(mp4Directory, metadata, Mp4Directory.TAG_TIME_SCALE, XMPDM.AUDIO_SAMPLE_RATE); | ||
} | ||
|
||
private void handleDurationInSeconds(Mp4Directory mp4Directory, Metadata metadata) { | ||
String durationInSeconds = mp4Directory.getString(Mp4Directory.TAG_DURATION_SECONDS); | ||
if (durationInSeconds == null) { | ||
return; | ||
} | ||
if (! durationInSeconds.contains("/")) { | ||
try { | ||
double d = Double.parseDouble(durationInSeconds); | ||
DecimalFormat df = | ||
(DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); | ||
df.applyPattern("0.0#"); | ||
metadata.set(XMPDM.DURATION, df.format(d)); | ||
} catch (NumberFormatException e) { | ||
//swallow | ||
} | ||
return; | ||
} | ||
String[] bits = durationInSeconds.split("/"); | ||
if (bits.length != 2) { | ||
return; | ||
} | ||
double durationSeconds; | ||
try { | ||
long numerator = Long.parseLong(bits[0]); | ||
long denominator = Long.parseLong(bits[1]); | ||
if (denominator != 0) { | ||
durationSeconds = (double) numerator / (double) denominator; | ||
// Get the duration | ||
//TODO Replace this with a 2dp Duration Property Converter | ||
//avoid thread safety issues by creating a new decimal format for every call | ||
//threadlocal doesn't play well in long running processes. | ||
DecimalFormat df = | ||
(DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); | ||
df.applyPattern("0.0#"); | ||
metadata.set(XMPDM.DURATION, df.format(durationSeconds)); | ||
} | ||
} catch (NumberFormatException e) { | ||
//log | ||
return; | ||
} | ||
} | ||
|
||
private void handleBrands(Mp4Directory mp4Directory, Metadata metadata) { | ||
|
||
|
||
String majorBrand = mp4Directory.getString(Mp4Directory.TAG_MAJOR_BRAND); | ||
// Identify the type based on the major brand | ||
Optional<MediaType> typeHolder = typesMap.entrySet().stream() | ||
.filter(e -> e.getValue().contains(majorBrand)).findFirst() | ||
.map(Map.Entry::getKey); | ||
|
||
if (!typeHolder.isPresent()) { | ||
String compatibleBrands = | ||
mp4Directory.getString(Mp4Directory.TAG_COMPATIBLE_BRANDS); | ||
if (compatibleBrands != null) { | ||
// If no match for major brand, see if any of the compatible brands match | ||
typeHolder = typesMap.entrySet().stream().filter(e -> | ||
e.getValue().stream().anyMatch(compatibleBrands::contains)) | ||
.findFirst().map(Map.Entry::getKey); | ||
} | ||
} | ||
MediaType type = typeHolder.orElse(MediaType.application("mp4")); | ||
if (metadata.getValues(Metadata.CONTENT_TYPE) == null) { | ||
metadata.set(Metadata.CONTENT_TYPE, type.toString()); | ||
} else if (! type.equals(APPLICATION_MP4)) { //todo check for specialization? | ||
metadata.set(Metadata.CONTENT_TYPE, type.toString()); | ||
} | ||
if (type.getType().equals("audio") && ! StringUtils.isBlank(majorBrand)) { | ||
metadata.set(XMPDM.AUDIO_COMPRESSOR, majorBrand.trim()); | ||
} | ||
|
||
} | ||
|
||
private void addDate(Mp4Directory mp4Directory, Metadata metadata, int tag, | ||
Property property) { | ||
Date d = mp4Directory.getDate(tag); | ||
if (d == null) { | ||
return; | ||
} | ||
metadata.set(property, d); | ||
|
||
} | ||
|
||
private void addDouble(Directory mp4Directory, Metadata metadata, int tag, | ||
Property property) { | ||
try { | ||
double val = mp4Directory.getDouble(tag); | ||
metadata.set(property, val); | ||
} catch (MetadataException e) { | ||
//log | ||
return; | ||
} | ||
|
||
} | ||
} |
79 changes: 79 additions & 0 deletions
79
tika-parsers/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.tika.parser.mp4; | ||
|
||
import java.io.IOException; | ||
|
||
import com.drew.imaging.mp4.Mp4Handler; | ||
import com.drew.lang.annotations.NotNull; | ||
import com.drew.lang.annotations.Nullable; | ||
import com.drew.metadata.Metadata; | ||
import com.drew.metadata.mp4.Mp4BoxHandler; | ||
import com.drew.metadata.mp4.Mp4Context; | ||
import com.drew.metadata.mp4.boxes.Box; | ||
import org.xml.sax.SAXException; | ||
|
||
import org.apache.tika.parser.mp4.boxes.TikaUserDataBox; | ||
import org.apache.tika.sax.XHTMLContentHandler; | ||
|
||
public class TikaMp4BoxHandler extends Mp4BoxHandler { | ||
|
||
org.apache.tika.metadata.Metadata tikaMetadata; | ||
final XHTMLContentHandler xhtml; | ||
public TikaMp4BoxHandler(Metadata metadata, org.apache.tika.metadata.Metadata tikaMetadata, | ||
XHTMLContentHandler xhtml) { | ||
super(metadata); | ||
this.tikaMetadata = tikaMetadata; | ||
this.xhtml = xhtml; | ||
} | ||
|
||
@Override | ||
public boolean shouldAcceptBox(@NotNull Box box) { | ||
if (box.type.equals("udta")) { | ||
return true; | ||
} | ||
return super.shouldAcceptBox(box); | ||
} | ||
|
||
@Override | ||
public boolean shouldAcceptContainer(@NotNull Box box) { | ||
return super.shouldAcceptContainer(box); | ||
} | ||
|
||
@Override | ||
public Mp4Handler<?> processBox(@NotNull Box box, @Nullable byte[] payload, Mp4Context context) | ||
throws IOException { | ||
if (box.type.equals("udta")) { | ||
return processUserData(box, payload, context); | ||
} | ||
|
||
return super.processBox(box, payload, context); | ||
} | ||
|
||
|
||
private Mp4Handler<?> processUserData(Box box, byte[] payload, Mp4Context context) throws IOException { | ||
if (payload == null) { | ||
return this; | ||
} | ||
try { | ||
new TikaUserDataBox(box, payload, tikaMetadata, xhtml).addMetadata(directory); | ||
} catch (SAXException e) { | ||
throw new IOException(e); | ||
} | ||
return this; | ||
} | ||
} |
Oops, something went wrong.