Skip to content

Commit

Permalink
Merge branch 'main' into feature/2696-Document-level-recommendations
Browse files Browse the repository at this point in the history
* main:
  #4370 - Download knowledge bases in compressed form
  #4369 - Display size of knowledge bases
  #4360 - Clean up code
  #4360 - Clean up code
  #4326 - Upgrade dependencies
  • Loading branch information
reckart committed Dec 10, 2023
2 parents d5e6d3b + 3a52674 commit 83210d4
Show file tree
Hide file tree
Showing 16 changed files with 483 additions and 289 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,26 @@

import static de.tudarmstadt.ukp.inception.project.api.ProjectService.DOCUMENT_FOLDER;
import static de.tudarmstadt.ukp.inception.project.api.ProjectService.PROJECT_FOLDER;
import static de.tudarmstadt.ukp.inception.project.api.ProjectService.SOURCE_FOLDER;
import static de.tudarmstadt.ukp.inception.support.io.FastIOUtils.copy;
import static java.lang.System.currentTimeMillis;
import static java.nio.file.Files.createDirectory;
import static java.nio.file.Files.createDirectories;
import static java.util.function.Function.identity;
import static java.util.stream.Collectors.toMap;
import static org.apache.commons.io.FileUtils.copyFileToDirectory;
import static org.apache.commons.io.FileUtils.forceMkdir;
import static org.apache.commons.lang3.time.DurationFormatUtils.formatDurationWords;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -57,6 +54,7 @@
import de.tudarmstadt.ukp.inception.documents.api.DocumentService;
import de.tudarmstadt.ukp.inception.documents.api.RepositoryProperties;
import de.tudarmstadt.ukp.inception.documents.config.DocumentServiceAutoConfiguration;
import de.tudarmstadt.ukp.inception.project.api.ProjectService;
import de.tudarmstadt.ukp.inception.support.logging.LogMessage;

/**
Expand All @@ -68,9 +66,7 @@
public class SourceDocumentExporter
implements ProjectExporter
{
private static final String SOURCE_FOLDER = "source";

private final Logger log = LoggerFactory.getLogger(getClass());
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

private final DocumentService documentService;
private final RepositoryProperties repositoryProperties;
Expand All @@ -93,11 +89,11 @@ public void exportData(FullProjectExportRequest aRequest, ProjectExportTaskMonit

private void exportSourceDocuments(Project aProject, ExportedProject exProject)
{
List<ExportedSourceDocument> sourceDocuments = new ArrayList<>();
var sourceDocuments = new ArrayList<ExportedSourceDocument>();

// add source documents to a project
List<SourceDocument> documents = documentService.listSourceDocuments(aProject);
for (SourceDocument sourceDocument : documents) {
var documents = documentService.listSourceDocuments(aProject);
for (var sourceDocument : documents) {
ExportedSourceDocument exDocument = new ExportedSourceDocument();
exDocument.setFormat(sourceDocument.getFormat());
exDocument.setName(sourceDocument.getName());
Expand All @@ -116,28 +112,28 @@ private void exportSourceDocumentContents(FullProjectExportRequest aRequest,
ProjectExportTaskMonitor aMonitor, ExportedProject aExProject, File aStage)
throws IOException, ProjectExportException, InterruptedException
{
Project project = aRequest.getProject();
File sourceDocumentDir = new File(aStage, SOURCE_FOLDER);
FileUtils.forceMkdir(sourceDocumentDir);
var project = aRequest.getProject();
var sourceDocumentDir = new File(aStage, SOURCE_FOLDER);
forceMkdir(sourceDocumentDir);
// Get all the source documents from the project
List<SourceDocument> documents = documentService.listSourceDocuments(project);
var documents = documentService.listSourceDocuments(project);
int i = 1;
for (SourceDocument sourceDocument : documents) {
for (var sourceDocument : documents) {
// check if the export has been cancelled
if (Thread.interrupted()) {
throw new InterruptedException();
}

try {
FileUtils.copyFileToDirectory(documentService.getSourceDocumentFile(sourceDocument),
sourceDocumentDir);
var documentFile = documentService.getSourceDocumentFile(sourceDocument);
copyFileToDirectory(documentFile, sourceDocumentDir);
aMonitor.setProgress((int) Math.ceil(((double) i) / documents.size() * 10.0));
log.info("Exported content for source document {}/{}: {} in {}", i,
LOG.info("Exported content for source document {}/{}: {} in {}", i,
documents.size(), sourceDocument, project);
i++;
}
catch (FileNotFoundException e) {
log.error("Source file [{}] related to project couldn't be located in repository",
LOG.error("Source file [{}] related to project couldn't be located in repository",
sourceDocument.getName(), ExceptionUtils.getRootCause(e));
aMonitor.addMessage(LogMessage.error(this,
"Source file [%s] related to project couldn't be located in repository",
Expand All @@ -153,14 +149,14 @@ public void importData(ProjectImportRequest aRequest, Project aProject,
ExportedProject aExProject, ZipFile aZip)
throws Exception
{
long start = currentTimeMillis();
var start = currentTimeMillis();

importSourceDocuments(aExProject, aProject);
importSourceDocumentContents(aZip, aProject);

log.info("Imported [{}] source documents into aProject ({})",
LOG.info("Imported [{}] source documents into aProject ({})",
aExProject.getSourceDocuments().size(), aProject,
DurationFormatUtils.formatDurationWords(currentTimeMillis() - start, true, true));
formatDurationWords(currentTimeMillis() - start, true, true));
}

/**
Expand All @@ -177,9 +173,8 @@ private void importSourceDocuments(ExportedProject aImportedProjectSetting,
Project aImportedProject)
throws IOException
{
for (ExportedSourceDocument importedSourceDocument : aImportedProjectSetting
.getSourceDocuments()) {
SourceDocument sourceDocument = new SourceDocument();
for (var importedSourceDocument : aImportedProjectSetting.getSourceDocuments()) {
var sourceDocument = new SourceDocument();
sourceDocument.setFormat(importedSourceDocument.getFormat());
sourceDocument.setName(importedSourceDocument.getName());
sourceDocument.setState(importedSourceDocument.getState());
Expand All @@ -202,44 +197,46 @@ private void importSourceDocuments(ExportedProject aImportedProjectSetting,
* @throws IOException
* if an I/O error occurs.
*/
@SuppressWarnings("rawtypes")
private void importSourceDocumentContents(ZipFile zip, Project aProject) throws IOException
{
// Query once for all the documents to avoid hitting the DB in the loop below
Map<String, SourceDocument> docs = documentService.listSourceDocuments(aProject).stream()
.collect(Collectors.toMap(SourceDocument::getName, identity()));
var docs = documentService.listSourceDocuments(aProject).stream()
.collect(toMap(SourceDocument::getName, identity()));

// Create the folder structure for the project. This saves time over waiting for the
// mkdirs in FastIOUtils.copy to kick in.
Path docRoot = Paths.get(repositoryProperties.getPath().getAbsolutePath(), PROJECT_FOLDER,
var docRoot = Paths.get(repositoryProperties.getPath().getAbsolutePath(), PROJECT_FOLDER,
aProject.getId().toString(), DOCUMENT_FOLDER);
Files.createDirectories(docRoot);
for (SourceDocument doc : docs.values()) {
Path docFolder = docRoot.resolve(doc.getId().toString());
createDirectory(docFolder);
Path sourceDocFolder = docFolder.resolve(SOURCE_FOLDER);
createDirectory(sourceDocFolder);
createDirectories(docRoot);

for (var doc : docs.values()) {
createDirectories(
docRoot.resolve(doc.getId().toString()).resolve(ProjectService.SOURCE_FOLDER));
}

int n = 0;
for (Enumeration zipEnumerate = zip.entries(); zipEnumerate.hasMoreElements();) {
ZipEntry entry = (ZipEntry) zipEnumerate.nextElement();
for (var entries = zip.entries(); entries.hasMoreElements();) {
var entry = entries.nextElement();

if (entry.isDirectory()) {
continue;
}

// Strip leading "/" that we had in ZIP files prior to 2.0.8 (bug #985)
String entryName = ProjectExporter.normalizeEntryName(entry);
var entryName = ProjectExporter.normalizeEntryName(entry);

if (entryName.startsWith(SOURCE_FOLDER)) {
String fileName = FilenameUtils.getName(entryName);
if (entryName.startsWith(ProjectService.SOURCE_FOLDER)) {
var fileName = FilenameUtils.getName(entryName);
if (fileName.trim().isEmpty()) {
continue;
}

SourceDocument sourceDocument = docs.get(fileName);
File sourceFilePath = documentService.getSourceDocumentFile(sourceDocument);
var sourceDocument = docs.get(fileName);
var sourceFilePath = documentService.getSourceDocumentFile(sourceDocument);
copy(zip.getInputStream(entry), sourceFilePath);

n++;
log.info("Imported content for source document {}/{}: {} in {}", n, docs.size(),
LOG.info("Imported content for source document {}/{}: {} in {}", n, docs.size(),
sourceDocument, aProject);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.documents.exporters;

import static de.tudarmstadt.ukp.inception.project.api.ProjectService.DOCUMENT_FOLDER;
import static de.tudarmstadt.ukp.inception.project.api.ProjectService.PROJECT_FOLDER;
import static de.tudarmstadt.ukp.inception.project.api.ProjectService.SOURCE_FOLDER;
import static java.util.Arrays.asList;
import static org.apache.commons.io.FileUtils.listFiles;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.io.File;
import java.nio.file.Files;
import java.util.List;
import java.util.zip.ZipFile;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import de.tudarmstadt.ukp.clarin.webanno.api.export.FullProjectExportRequest;
import de.tudarmstadt.ukp.clarin.webanno.api.export.ProjectExportTaskMonitor;
import de.tudarmstadt.ukp.clarin.webanno.api.export.ProjectImportRequest;
import de.tudarmstadt.ukp.clarin.webanno.export.model.ExportedProject;
import de.tudarmstadt.ukp.clarin.webanno.model.Project;
import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument;
import de.tudarmstadt.ukp.inception.documents.api.DocumentService;
import de.tudarmstadt.ukp.inception.documents.api.RepositoryProperties;
import de.tudarmstadt.ukp.inception.support.io.ZipUtils;

/**
 * Round-trip test for {@link SourceDocumentExporter}: source documents are written to one
 * repository folder, exported into a staging folder, zipped, and then imported into a second
 * repository folder. The test passes if both repository folders end up with the same relative file
 * layout.
 */
@ExtendWith(MockitoExtension.class)
public class SourceDocumentExporterTest
{
    private RepositoryProperties repositoryProperties;

    private @Mock DocumentService documentService;

    private Project project;

    private SourceDocumentExporter sut;

    /**
     * Creates the test project and wires the mocked {@link DocumentService} so that it serves a
     * fixed set of documents and resolves their files below the currently configured repository
     * path.
     *
     * @throws Exception
     *             if the fixture cannot be set up.
     */
    @BeforeEach
    public void setUp() throws Exception
    {
        project = Project.builder() //
                .withId(1L) //
                .withName("Test Project") //
                .build();

        repositoryProperties = new RepositoryProperties();

        when(documentService.listSourceDocuments(any())).then(invocation -> sourceDocuments());

        // The repository path is read at invocation time, not at stubbing time, so the test can
        // redirect the mock to another folder by calling repositoryProperties.setPath(...).
        when(documentService.getSourceDocumentFile(any())).then(invocation -> {
            var doc = invocation.getArgument(0, SourceDocument.class);
            return repositoryProperties.getPath().toPath() //
                    .resolve(PROJECT_FOLDER) //
                    .resolve(String.valueOf(doc.getProject().getId())) //
                    .resolve(DOCUMENT_FOLDER) //
                    .resolve(String.valueOf(doc.getId())) //
                    .resolve(SOURCE_FOLDER) //
                    .resolve(doc.getName()) //
                    .toFile();
        });

        sut = new SourceDocumentExporter(documentService, repositoryProperties);
    }

    /**
     * Exports the source documents from {@code sourceWorkDir}, zips the staging folder and imports
     * the archive into {@code targetWorkDir}, then compares the relative paths of all files found
     * in both folders.
     *
     * @param sourceWorkDir
     *            repository folder the documents are exported from.
     * @param targetWorkDir
     *            repository folder the documents are imported into.
     * @param stage
     *            staging folder receiving the exported data before it is zipped.
     * @throws Exception
     *             if exporting, zipping or importing fails.
     */
    @Test
    public void thatExportingAndImportingWorks(@TempDir File sourceWorkDir,
            @TempDir File targetWorkDir, @TempDir File stage)
        throws Exception
    {
        repositoryProperties.setPath(sourceWorkDir);

        // Prepare some source files; each file simply contains its own name
        for (var doc : sourceDocuments()) {
            var file = documentService.getSourceDocumentFile(doc).toPath();
            Files.createDirectories(file.getParent());
            Files.writeString(file, doc.getName());
        }

        // Export the source files
        var exportRequest = new FullProjectExportRequest(project, null, false);
        var monitor = mock(ProjectExportTaskMonitor.class);
        var exProject = new ExportedProject();
        sut.exportData(exportRequest, monitor, exProject, stage);

        // The archive lives outside the @TempDir folders, so it has to be removed explicitly
        var zipFile = File.createTempFile("test", ".zip");
        try {
            ZipUtils.zipFolder(stage, zipFile);

            // Import the project again, this time into the target repository folder
            repositoryProperties.setPath(targetWorkDir);
            var importRequest = new ProjectImportRequest(true);
            // Close the archive before deleting it - an open handle blocks deletion on Windows
            try (var zip = new ZipFile(zipFile)) {
                sut.importData(importRequest, project, exProject, zip);
            }
        }
        finally {
            Files.deleteIfExists(zipFile.toPath());
        }

        var sourceFiles = listFiles(sourceWorkDir, null, true).stream()
                .map(f -> sourceWorkDir.toPath().relativize(f.toPath())).toList();
        var targetFiles = listFiles(targetWorkDir, null, true).stream()
                .map(f -> targetWorkDir.toPath().relativize(f.toPath())).toList();

        assertThat(targetFiles) //
                .isNotEmpty() //
                .hasSameSizeAs(sourceDocuments()) //
                .containsExactlyInAnyOrderElementsOf(sourceFiles);
    }

    /**
     * @return the fixed set of source documents served by the mocked {@link DocumentService}. A
     *         fresh list with fresh instances is built on every call.
     */
    private List<SourceDocument> sourceDocuments()
    {
        return asList(
                SourceDocument.builder().withId(1L).withProject(project).withName("1.txt").build(),
                SourceDocument.builder().withId(2L).withProject(project).withName("2.txt").build());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,11 @@ void importData(ProjectImportRequest aRequest, Project aProject, ExportedProject
/**
 * Returns the name of the given ZIP entry without a single leading {@code "/"}.
 *
 * @param aEntry
 *            the ZIP entry whose name should be normalized.
 * @return the entry name with one leading slash removed if there was one, otherwise the entry
 *         name unchanged.
 */
static String normalizeEntryName(ZipEntry aEntry)
{
    // Strip leading "/" that we had in ZIP files prior to 2.0.8 (bug #985)
    var entryName = aEntry.getName();
    if (entryName.startsWith("/")) {
        entryName = entryName.substring(1);
    }

    return entryName;
}

}
4 changes: 0 additions & 4 deletions inception/inception-kb/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,6 @@
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-common-iterator</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-repository-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ List<KBHandle> listInstances(KnowledgeBase kb, String aConceptIri, boolean aAll)
* The knowledge base to query
* @param aIdentifier
* The identifier of the entity
* @return All statements that match the specification
* @return All statements that match the specification
*/
List<Statement> listStatementsWithPredicateOrObjectReference(KnowledgeBase kb,
String aIdentifier);
Expand Down Expand Up @@ -629,4 +629,10 @@ Optional<KBConcept> readConcept(KnowledgeBase aKB, String aIdentifier, boolean a
List<KBHandle> listHandlesCaching(KnowledgeBase aKB, SPARQLQuery aQuery, boolean aAll);

Optional<KBHandle> fetchHandleCaching(KnowledgeBase aKB, SPARQLQuery aQuery, boolean aAll);

long getRepositorySize(KnowledgeBase aKB);

long getStatementCount(KnowledgeBase aKB);

long getIndexSize(KnowledgeBase aKB);
}
Loading

0 comments on commit 83210d4

Please sign in to comment.