From a8aac8d7b9667571dbf762bdfa9086a56e98ae5c Mon Sep 17 00:00:00 2001 From: Olcay Taner YILDIZ Date: Mon, 14 Sep 2020 23:20:38 +0300 Subject: [PATCH] Started repository translation. --- .../contents.xcworkspacedata | 2 +- Package.swift | 5 +- Sources/WordToVec/Iteration.swift | 124 +++++++++ Sources/WordToVec/NeuralNetwork.swift | 239 ++++++++++++++++++ Sources/WordToVec/Vocabulary.swift | 186 ++++++++++++++ Sources/WordToVec/VocabularyWord.swift | 113 +++++++++ Sources/WordToVec/WordToVec.swift | 3 - Sources/WordToVec/WordToVecParameter.swift | 154 +++++++++++ Tests/WordToVecTests/NeuralNetworkTest.swift | 44 ++++ 9 files changed, 863 insertions(+), 7 deletions(-) create mode 100644 Sources/WordToVec/Iteration.swift create mode 100644 Sources/WordToVec/NeuralNetwork.swift create mode 100644 Sources/WordToVec/Vocabulary.swift create mode 100644 Sources/WordToVec/VocabularyWord.swift delete mode 100644 Sources/WordToVec/WordToVec.swift create mode 100644 Sources/WordToVec/WordToVecParameter.swift create mode 100644 Tests/WordToVecTests/NeuralNetworkTest.swift diff --git a/.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata b/.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata index 706eede..919434a 100644 --- a/.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata +++ b/.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata @@ -2,6 +2,6 @@ + location = "self:"> diff --git a/Package.swift b/Package.swift index a49b218..0de7ab0 100644 --- a/Package.swift +++ b/Package.swift @@ -12,15 +12,14 @@ let package = Package( targets: ["WordToVec"]), ], dependencies: [ - // Dependencies declare other packages that this package depends on. - // .package(url: /* package url */, from: "1.0.0"), + .package(name: "Corpus", url: "https://github.com/StarlangSoftware/Corpus-Swift.git", .exact("1.0.2")), ], targets: [ // Targets are the basic building blocks of a package. A target can define a module or a test suite. 
// Targets can depend on other targets in this package, and on products in packages which this package depends on. .target( name: "WordToVec", - dependencies: []), + dependencies: ["Corpus"]), .testTarget( name: "WordToVecTests", dependencies: ["WordToVec"]), diff --git a/Sources/WordToVec/Iteration.swift b/Sources/WordToVec/Iteration.swift new file mode 100644 index 0000000..f8bbe69 --- /dev/null +++ b/Sources/WordToVec/Iteration.swift @@ -0,0 +1,124 @@ +// +// Iteration.swift +// +// +// Created by Olcay Taner YILDIZ on 14.09.2020. +// + +import Foundation +import Corpus + +public class Iteration{ + + private var __wordCount: Int = 0 + private var __lastWordCount: Int = 0 + private var __wordCountActual: Int = 0 + private var __iterationCount: Int = 0 + private var __sentencePosition: Int = 0 + private var __sentenceIndex: Int = 0 + private var __startingAlpha: Double + private var __alpha: Double + private var __corpus: Corpus + private var __wordToVecParameter: WordToVecParameter + + /** + Constructor for the Iteration class. Stores the given corpus and parameters, and initializes both the starting + and the current learning rate from the alpha value of the parameters. All counters start at zero. + + - Parameters: + - corpus : Corpus used to train word vectors using Word2Vec algorithm. + - wordToVecParameter : Parameters of the Word2Vec algorithm. + */ + public init(corpus: Corpus, wordToVecParameter: WordToVecParameter){ + self.__corpus = corpus + self.__wordToVecParameter = wordToVecParameter + self.__startingAlpha = wordToVecParameter.getAlpha() + self.__alpha = wordToVecParameter.getAlpha() + } + + /** + Accessor for the alpha attribute. + + - Returns: Current value of the learning rate alpha. + */ + public func getAlpha() -> Double{ + return self.__alpha + } + + /** + Accessor for the iterationCount attribute. + + - Returns: Number of passes over the corpus completed so far. + */ + public func getIterationCount() -> Int{ + return self.__iterationCount + } + + /** + Accessor for the sentenceIndex attribute.
+ + - Returns: SentenceIndex attribute + */ + public func getSentenceIndex() -> Int{ + return self.__sentenceIndex + } + + /** + Accessor for the sentencePosition attribute. + + - Returns: Position of the current word within the current sentence. + */ + public func getSentencePosition() -> Int{ + return self.__sentencePosition + } + + /** + Recomputes the alpha parameter once the word counter has advanced by more than 10000 words since the last update. Alpha is the starting alpha scaled by (1 - processedWords / (numberOfIterations * corpusWordCount + 1)) and is never allowed to drop below 0.0001 times the starting alpha. + */ + public func alphaUpdate(){ + if self.__wordCount - self.__lastWordCount > 10000{ + self.__wordCountActual += self.__wordCount - self.__lastWordCount + self.__lastWordCount = self.__wordCount + self.__alpha = self.__startingAlpha * (1.0 - Double(self.__wordCountActual) / + (Double(self.__wordToVecParameter.getNumberOfIterations()) * + Double(self.__corpus.numberOfWords()) + 1.0)) + if self.__alpha < self.__startingAlpha * 0.0001{ + self.__alpha = self.__startingAlpha * 0.0001 + } + } + } + + /** + Updates sentencePosition, sentenceIndex (if needed) and returns the current sentence processed. If one sentence + is finished, the position shows the beginning of the next sentence and sentenceIndex is incremented. If the + current sentence is the last sentence, the system shuffles the sentences and returns the first sentence. + + PARAMETERS + ---------- + currentSentence : Sentence + Current sentence processed. + + RETURNS + ------- + Sentence + If current sentence is not changed, currentSentence; if changed the next sentence; if next sentence is + the last sentence; shuffles the corpus and returns the first sentence.
+ */ + public func sentenceUpdate(currentSentence: Sentence) -> Sentence{ + self.__sentencePosition = self.__sentencePosition + 1 + if self.__sentencePosition >= currentSentence.wordCount(){ + self.__wordCount += currentSentence.wordCount() + self.__sentenceIndex = self.__sentenceIndex + 1 + self.__sentencePosition = 0 + if self.__sentenceIndex == self.__corpus.sentenceCount(){ + self.__iterationCount = self.__iterationCount + 1 + self.__wordCount = 0 + self.__lastWordCount = 0 + self.__sentenceIndex = 0 + self.__corpus.shuffleSentences(seed: 1) + } + return self.__corpus.getSentence(index: self.__sentenceIndex) + } + return currentSentence + } +} diff --git a/Sources/WordToVec/NeuralNetwork.swift b/Sources/WordToVec/NeuralNetwork.swift new file mode 100644 index 0000000..eb8ff47 --- /dev/null +++ b/Sources/WordToVec/NeuralNetwork.swift @@ -0,0 +1,239 @@ +// +// File.swift +// +// +// Created by Olcay Taner YILDIZ on 14.09.2020. +// + +import Foundation +import Math +import Corpus +import Dictionary + +class NeuralNetwork{ + + private var __wordVectors: Matrix + private var __wordVectorUpdate: Matrix + private var __vocabulary: Vocabulary + private var __parameter: WordToVecParameter + private var __corpus: Corpus + private var __expTable: [Double] = [] + + private static var EXP_TABLE_SIZE = 1000 + private static var MAX_EXP = 6 + + /** + Constructor for the NeuralNetwork class. Gets corpus and network parameters as input and sets the + corresponding parameters first. After that, initializes the network with random weights between -0.5 and 0.5. + Constructs vector update matrix and prepares the exp table. + + - Parameters: + - corpus : Corpus used to train word vectors using Word2Vec algorithm. + - parameter : Parameters of the Word2Vec algorithm. 
+ */ + public init(corpus: Corpus, parameter: WordToVecParameter){ + self.__vocabulary = Vocabulary(corpus: corpus) + self.__parameter = parameter + self.__corpus = corpus + self.__wordVectors = Matrix(row: self.__vocabulary.size(), col: self.__parameter.getLayerSize(), min: -0.5, max: 0.5) + self.__wordVectorUpdate = Matrix(row: self.__vocabulary.size(), col: self.__parameter.getLayerSize()) + self.__prepareExpTable() + } + + /** + Constructs the fast exponentiation table. Instead of taking exponent at each time, the algorithm will lookup + the table. + */ + public func __prepareExpTable(){ + self.__expTable = Array(repeating: 0.0, count: NeuralNetwork.EXP_TABLE_SIZE + 1) + for i in 0.. VectorizedDictionary{ + let result : VectorizedDictionary = VectorizedDictionary() + if self.__parameter.isCbow(){ + self.__trainCbow() + } else { + self.__trainSkipGram() + } + for i in 0.. Double{ + if f > Double(NeuralNetwork.MAX_EXP){ + return (label - 1) * alpha + } else if f < -Double(NeuralNetwork.MAX_EXP){ + return label * alpha + } else { + return (label - self.__expTable[Int((f + Double(NeuralNetwork.MAX_EXP)) * + Double(NeuralNetwork.EXP_TABLE_SIZE / NeuralNetwork.MAX_EXP / 2))]) * alpha + } + } + + /** + Main method for training the CBow version of Word2Vec algorithm. 
+ */ + public func __trainCbow(){ + let iteration = Iteration(corpus: self.__corpus, wordToVecParameter: self.__parameter) + var currentSentence : Sentence = self.__corpus.getSentence(index: iteration.getSentenceIndex()) + let outputs = Vector(size: self.__parameter.getLayerSize(), x: 0.0) + let outputUpdate = Vector(size: self.__parameter.getLayerSize(), x: 0) + self.__corpus.shuffleSentences(seed: 1) + while iteration.getIterationCount() < self.__parameter.getNumberOfIterations(){ + iteration.alphaUpdate() + let wordIndex = self.__vocabulary.getPosition(word: currentSentence.getWord(index: iteration.getSentencePosition())) + let currentWord = self.__vocabulary.getWord(index: wordIndex) + outputs.clear() + outputUpdate.clear() + let b = Int.random(in: 0.. 0{ + outputs.divide(value: Double(cw)) + if self.__parameter.isHierarchicalSoftMax(){ + for d in 0..= Double(NeuralNetwork.MAX_EXP){ + continue + } else { + f = self.__expTable[Int((f + Double(NeuralNetwork.MAX_EXP)) * + Double(NeuralNetwork.EXP_TABLE_SIZE / NeuralNetwork.MAX_EXP / 2))] + } + let g = (1.0 - Double(currentWord.getCode(index: d)) - f) * iteration.getAlpha() + outputUpdate.addVector(v: self.__wordVectorUpdate.getRowVector(row: l2).product(value: g)) + self.__wordVectorUpdate.addRowVector(rowNo: l2, v: outputs.product(value: g)) + } + } else { + var target : Int + var label : Int + for d in 0..= Double(NeuralNetwork.MAX_EXP){ + continue + } else { + f = self.__expTable[Int((f + Double(NeuralNetwork.MAX_EXP)) * + Double(NeuralNetwork.EXP_TABLE_SIZE / NeuralNetwork.MAX_EXP / 2))] + } + let g = (1.0 - Double(currentWord.getCode(index: d)) - f) * iteration.getAlpha() + outputUpdate.addVector(v: self.__wordVectorUpdate.getRowVector(row: l2).product(value: g)) + self.__wordVectorUpdate.addRowVector(rowNo: l2, v: self.__wordVectors.getRowVector(row: l1).product(value: g)) + } + } else { + var target : Int + var label : Int + for d in 0.. 
Int{ + return self.__vocabulary.count + } + + /** + Searches a word by name with a binary search over the vocabulary and returns the first index whose word name is not less than the searched name. + + - Parameter word : Word to be searched. + + - Returns: Position of the word searched; if no word with that name is stored, the index at which it would be inserted (possibly equal to the vocabulary size). NOTE(review): the comparison is on names, so this presumes the vocabulary is ordered by name - confirm against the constructor. + */ + public func getPosition(word: Word) -> Int{ + var lo : Int = 0 + var hi : Int = self.__vocabulary.count + while lo < hi{ + let mid : Int = (lo + hi) / 2 + if self.__vocabulary[mid].getName() < word.getName(){ + lo = mid + 1 + } else { + hi = mid + } + } + return lo + } + + /** + Returns the word at a given index. + + - Parameter index : Index of the word. + + - Returns: The word at a given index. + */ + public func getWord(index: Int) -> VocabularyWord{ + return self.__vocabulary[index] + } + + /** + Constructs Huffman Tree based on the number of occurrences of the words. + */ + public func __constructHuffmanTree(){ + var count : [Int] = Array(repeating: 0, count: self.__vocabulary.count * 2 + 1) + var code : [Int] = Array(repeating: 0, count: VocabularyWord.MAX_CODE_LENGTH) + var point : [Int] = Array(repeating: 0, count: VocabularyWord.MAX_CODE_LENGTH) + var binary : [Int] = Array(repeating: 0, count: self.__vocabulary.count * 2 + 1) + var parentNode : [Int] = Array(repeating: 0, count: self.__vocabulary.count * 2 + 1) + for a in 0..= 0{ + if count[pos1] < count[pos2] { + min1i = pos1 + pos1 = pos1 - 1 + } else { + min1i = pos2 + pos2 = pos2 + 1 + } + } else { + min1i = pos2 + pos2 = pos2 + 1 + } + if pos1 >= 0{ + if count[pos1] < count[pos2]{ + min2i = pos1 + pos1 = pos1 - 1 + } else { + min2i = pos2 + pos2 = pos2 + 1 + } + } else { + min2i = pos2 + pos2 = pos2 + 1 + } + count[self.__vocabulary.count + a] = count[min1i] + count[min2i] + parentNode[min1i] = self.__vocabulary.count + a + parentNode[min2i] = self.__vocabulary.count + a + binary[min2i] = 1 + } + for a in 0..
d1{ + i = i + 1 + d1 += pow(Double(self.__vocabulary[i].getCount()), 0.75) / total + } + if i >= self.__vocabulary.count{ + i = self.__vocabulary.count - 1 + } + } + } + + /** + Accessor for the unigram table. + + - Parameter index : Index into the unigram table. + + - Returns: Unigram table value at a given index. + */ + public func getTableValue(index: Int) -> Int{ + return self.__table[index] + } + + /** + Returns size of the unigram table. + + - Returns: Size of the unigram table. + */ + public func getTableSize() -> Int{ + return self.__table.count + } +} diff --git a/Sources/WordToVec/VocabularyWord.swift b/Sources/WordToVec/VocabularyWord.swift new file mode 100644 index 0000000..0793678 --- /dev/null +++ b/Sources/WordToVec/VocabularyWord.swift @@ -0,0 +1,113 @@ +// +// VocabularyWord.swift +// +// +// Created by Olcay Taner YILDIZ on 14.09.2020. +// + +import Foundation +import Dictionary + +public class VocabularyWord : Word{ + + private var __count: Int + private var __code: [Int] + private var __poInt: [Int] + private var __codeLength: Int + public static var MAX_CODE_LENGTH = 40 + + /** + Constructor for a VocabularyWord. The constructor gets name and count values and sets the corresponding + attributes. It also initializes the code and point arrays with MAX_CODE_LENGTH zeros each and sets the code length to zero. + + - Parameters: + - name : Lemma of the word + - count : Number of occurrences of this word in the corpus + */ + public init(name: String, count: Int){ + self.__count = count + self.__code = Array(repeating: 0, count: VocabularyWord.MAX_CODE_LENGTH) + self.__poInt = Array(repeating: 0, count: VocabularyWord.MAX_CODE_LENGTH) + self.__codeLength = 0 + super.init(name: name) + } + + public static func < (lhs: VocabularyWord, rhs: VocabularyWord) -> Bool { + return lhs.__count < rhs.__count + } + + public static func == (lhs: VocabularyWord, rhs: VocabularyWord) -> Bool { + return lhs.__count == rhs.__count + } + + /** + Accessor for the count attribute. + + - Returns: Number of occurrences of this word.
+ */ + public func getCount() -> Int{ + return self.__count + } + + /** + Mutator for codeLength attribute. + + - Parameter codeLength : New value for the codeLength. + */ + public func setCodeLength(codeLength: Int){ + self.__codeLength = codeLength + } + + /** + Mutator for code attribute. + + - Parameters: + - index : Index of the code element to set. + - value : New value for that indexed element of code. + */ + public func setCode(index: Int, value: Int){ + self.__code[index] = value + } + + /** + Mutator for the point attribute. + + - Parameters: + - index : Index of the point element to set. + - value : New value for that indexed element of point. + */ + public func setPoint(index: Int, value: Int){ + self.__poInt[index] = value + } + + /** + Accessor for the codeLength attribute. + + - Returns: Length of the Huffman code for this word. + */ + public func getCodeLength() -> Int{ + return self.__codeLength + } + + /** + Accessor for the point attribute. + + - Parameter index : Index of the point element. + + - Returns: Value for that indexed element of point. + */ + public func getPoint(index: Int) -> Int{ + return self.__poInt[index] + } + + /** + Accessor for code attribute. + + - Parameter index : Index of the code. + + - Returns: Value for that indexed element of code. + */ + public func getCode(index: Int) -> Int{ + return self.__code[index] + } +} diff --git a/Sources/WordToVec/WordToVec.swift b/Sources/WordToVec/WordToVec.swift deleted file mode 100644 index 2e1b385..0000000 --- a/Sources/WordToVec/WordToVec.swift +++ /dev/null @@ -1,3 +0,0 @@ -struct WordToVec { - var text = "Hello, World!" -} diff --git a/Sources/WordToVec/WordToVecParameter.swift b/Sources/WordToVec/WordToVecParameter.swift new file mode 100644 index 0000000..b3e248a --- /dev/null +++ b/Sources/WordToVec/WordToVecParameter.swift @@ -0,0 +1,154 @@ +// +// WordToVecParameter.swift +// +// +// Created by Olcay Taner YILDIZ on 14.09.2020.
+// + +import Foundation + +public class WordToVecParameter{ + + private var __layerSize: Int = 100 + private var __cbow: Bool = true + private var __alpha: Double = 0.025 + private var __window: Int = 5 + private var __hierarchicalSoftMax: Bool = false + private var __negativeSamplingSize: Int = 5 + private var __numberOfIterations: Int = 3 + + /** + Empty constructor for Word2Vec parameters. Every attribute keeps its default value: layerSize 100, cbow true, alpha 0.025, window 5, hierarchicalSoftMax false, negativeSamplingSize 5, numberOfIterations 3. + */ + public init(){ + } + + /** + Accessor for layerSize attribute. + + - Returns: Size of the word vectors. + */ + public func getLayerSize() -> Int{ + return self.__layerSize + } + + /** + Accessor for CBow attribute. + + - Returns: True if CBow will be applied, false otherwise. + */ + public func isCbow() -> Bool{ + return self.__cbow + } + + /** + Accessor for the alpha attribute. + + - Returns: Current learning rate alpha. + */ + public func getAlpha() -> Double{ + return self.__alpha + } + + /** + Accessor for the window size attribute. + + - Returns: Current window size. + */ + public func getWindow() -> Int{ + return self.__window + } + + /** + Accessor for the hierarchicalSoftMax attribute. + + - Returns: True if hierarchical softmax will be applied; false otherwise. + */ + public func isHierarchicalSoftMax() -> Bool{ + return self.__hierarchicalSoftMax + } + + /** + Accessor for the negativeSamplingSize attribute. + + - Returns: Number of negative samples that will be withdrawn. + */ + public func getNegativeSamplingSize() -> Int{ + return self.__negativeSamplingSize + } + + /** + Accessor for the numberOfIterations attribute. + + - Returns: Number of epochs to train the network. + */ + public func getNumberOfIterations() -> Int{ + return self.__numberOfIterations + } + + /** + Mutator for the layerSize attribute. + + - Parameter layerSize : New size of the word vectors.
+ */ + public func setLayerSize(layerSize: Int){ + self.__layerSize = layerSize + } + + /** + Mutator for the cbow attribute. + + - Parameter cbow : True if CBow applied; false if SkipGram applied. + */ + public func setCbow(cbow: Bool){ + self.__cbow = cbow + } + + /** + Mutator for the alpha attribute. + + - Parameter alpha : New learning rate. + */ + public func setAlpha(alpha: Double){ + self.__alpha = alpha + } + + /** + Mutator for the window size attribute. + + - Parameter window : New window size. + */ + public func setWindow(window: Int){ + self.__window = window + } + + /** + Mutator for the hierarchicalSoftMax attribute. + + - Parameter hierarchicalSoftMax : True if hierarchical softmax is applied; false otherwise. + */ + public func setHierarchialSoftMax(hierarchicalSoftMax: Bool){ + self.__hierarchicalSoftMax = hierarchicalSoftMax + } + + /** + Mutator for the negativeSamplingSize attribute. + + - Parameter negativeSamplingSize : New number of negative instances that will be withdrawn. + */ + public func setNegativeSamplingSize(negativeSamplingSize: Int){ + self.__negativeSamplingSize = negativeSamplingSize + } + + /** + Mutator for the numberOfIterations attribute. + + - Parameter numberOfIterations : New number of iterations.
+ */ + public func setNumberOfIterations(numberOfIterations: Int){ + self.__numberOfIterations = numberOfIterations + } +} diff --git a/Tests/WordToVecTests/NeuralNetworkTest.swift b/Tests/WordToVecTests/NeuralNetworkTest.swift new file mode 100644 index 0000000..51b65b1 --- /dev/null +++ b/Tests/WordToVecTests/NeuralNetworkTest.swift @@ -0,0 +1,44 @@ +import XCTest +import Corpus +import Dictionary +@testable import WordToVec + +final class NeuralNetworkTest: XCTestCase { + private var turkish: Corpus = Corpus() + private var english: Corpus = Corpus() + + override func setUp() { + self.english = Corpus(fileName: "english-similarity-dataset.txt"); + self.turkish = Corpus(fileName: "turkish-similarity-dataset.txt"); + } + + private func train(corpus: Corpus, cBow: Bool) -> VectorizedDictionary{ + let parameter = WordToVecParameter() + parameter.setCbow(cbow: cBow) + let neuralNetwork = NeuralNetwork(corpus: corpus, parameter: parameter) + return neuralNetwork.train() + } + + public func testTrainEnglishCBow(){ + let dictionary = self.train(corpus: self.english, cBow: true) + } + + public func testTrainEnglishSkipGram(){ + let dictionary = self.train(corpus: self.english, cBow: false) + } + + public func testTrainTurkishCBow(){ + let dictionary = self.train(corpus: self.turkish, cBow: true) + } + + public func testTrainTurkishSkipGram(){ + let dictionary = self.train(corpus: self.turkish, cBow: false) + } + + static var allTests = [ + ("testExample1", testTrainEnglishCBow), + ("testExample2", testTrainEnglishSkipGram), + ("testExample3", testTrainTurkishCBow), + ("testExample4", testTrainTurkishSkipGram), + ] +}