diff --git a/languages/llvmir/README.md b/languages/llvmir/README.md index 6d5af395d..a006a51d2 100644 --- a/languages/llvmir/README.md +++ b/languages/llvmir/README.md @@ -22,6 +22,20 @@ These include binary and bitwise instructions (like addition and or), memory ope To use the LLVM IR module, add the `-l llvmir` flag in the CLI, or use a `JPlagOption` object with `new de.jplag.llvmir.LLVMIRLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag). +We recommend using the [LLVM optimizer](https://llvm.org/docs/CommandGuide/opt.html) to optimize the LLVM IR code before using JPlag. +In our tests, optimization level 1 showed the best results in plagiarism detection quality and should therefore be used. + +### Minimum Token Match + +It can be difficult to find a good value for the minimum token match because the range of possible candidates for low-level languages like the LLVM IR is much larger than for high-level languages. +Values can range from around 60–70 for code compiled from C to more than 1000 for code compiled from C++. +From our tests, we calculated a formula that depends on the average lines of code (avg. loc) to determine a value that should provide good results: + +min_token_match(x) = 48.2055162 * e^(0.000333593799 * x) + +with x = (avg. loc of the LLVM IR code) - (avg. loc of the source code),
+where the source code is the code from which the IR code was generated, for example, the C or C++ code. +
#### Footnotes diff --git a/languages/llvmir/src/main/java/de/jplag/llvmir/LLVMIRLanguage.java b/languages/llvmir/src/main/java/de/jplag/llvmir/LLVMIRLanguage.java index 8f0d65830..846a047e6 100644 --- a/languages/llvmir/src/main/java/de/jplag/llvmir/LLVMIRLanguage.java +++ b/languages/llvmir/src/main/java/de/jplag/llvmir/LLVMIRLanguage.java @@ -13,7 +13,7 @@ public class LLVMIRLanguage extends AbstractAntlrLanguage { private static final String NAME = "LLVMIR Parser"; private static final String IDENTIFIER = "llvmir"; - private static final int DEFAULT_MIN_TOKEN_MATCH = 40; + private static final int DEFAULT_MIN_TOKEN_MATCH = 70; private static final String[] FILE_EXTENSIONS = {".ll"}; public LLVMIRLanguage() {