From 1cd367889d29ac2978cf54b17f099558a985153f Mon Sep 17 00:00:00 2001
From: Olcay Taner YILDIZ
Date: Thu, 21 Apr 2022 14:52:43 +0300
Subject: [PATCH] Added wordCount method and tests for TreeBank.

---
 Sources/ParseTree/ParseNode.swift       | 33 ++++++++++++++++++++
 Sources/ParseTree/ParseTree.swift       | 10 ++++++
 Sources/ParseTree/TreeBank.swift        | 20 +++++++++---
 Tests/ParseTreeTests/TreeBankTest.swift | 27 ++++++++++++++++
 4 files changed, 86 insertions(+), 4 deletions(-)
 create mode 100644 Tests/ParseTreeTests/TreeBankTest.swift

diff --git a/Sources/ParseTree/ParseNode.swift b/Sources/ParseTree/ParseNode.swift
index 323ee4e..9a18713 100644
--- a/Sources/ParseTree/ParseNode.swift
+++ b/Sources/ParseTree/ParseNode.swift
@@ -544,6 +544,39 @@ open class ParseNode : Equatable{
         self.data = data
     }
 
+    /// Lower-cased tokens that `wordCount(excludeStopWords:)` skips when stop words are excluded.
+    private static let stopWords: Set<String> = [
+        "at", "the", "to", "a", "an", "not", "is", "was", "were", "have", "had", "has", "by", "on",
+        "off", "'s", "n't", "can", "could", "may", "might", "will", "would", "as", "with", "for",
+        "than", "$", "and", "or", "of", "are", "be", "been", "do", "few", "there", "up", "down",
+        "in", "'re"
+    ]
+
+    /**
+     * Recursive function to count the number of words in the subtree rooted at this node. A node without
+     * children counts as one word; any other node contributes only the counts of its children.
+    - Parameters:
+        - excludeStopWords: If true, punctuation symbols, tokens containing "*" and stop words are not counted.
+    - Returns: Number of words in the subtree rooted at this node.
+     */
+    public func wordCount(excludeStopWords: Bool) -> Int{
+        // A missing child list is handled like an empty one instead of being force-unwrapped.
+        let childNodes = children ?? []
+        guard childNodes.isEmpty else {
+            return childNodes.reduce(0) { $0 + $1.wordCount(excludeStopWords: excludeStopWords) }
+        }
+        if !excludeStopWords{
+            return 1
+        }
+        // A leaf without a symbol has no text to classify, so it is not counted once filtering is on.
+        guard let name = data?.getName() else {
+            return 0
+        }
+        let isExcluded = Word.isPunctuationSymbol(surfaceForm: name) || name.contains("*")
+            || ParseNode.stopWords.contains(name.lowercased())
+        return isExcluded ? 0 : 1
+    }
+
     /**
      * Construct recursively the constituent span list of a subtree rooted at this node.
     - Parameters:
diff --git a/Sources/ParseTree/ParseTree.swift b/Sources/ParseTree/ParseTree.swift
index e28b774..6214620 100644
--- a/Sources/ParseTree/ParseTree.swift
+++ b/Sources/ParseTree/ParseTree.swift
@@ -157,4 +157,14 @@ open class ParseTree{
     public func toSentence() -> String{
         return (root?.toSentence().trimmingCharacters(in: .whitespacesAndNewlines))!
     }
+
+    /**
+     * Calls recursive function to count the number of words in the tree.
+    - Parameters:
+        - excludeStopWords: If true, stop words are not counted.
+    - Returns: Number of words in the tree; 0 when the tree has no root.
+     */
+    public func wordCount(excludeStopWords: Bool) -> Int{
+        return root?.wordCount(excludeStopWords: excludeStopWords) ?? 0
+    }
 }
diff --git a/Sources/ParseTree/TreeBank.swift b/Sources/ParseTree/TreeBank.swift
index 78e4ddc..a548bbb 100644
--- a/Sources/ParseTree/TreeBank.swift
+++ b/Sources/ParseTree/TreeBank.swift
@@ -26,8 +26,7 @@ open class TreeBank {
         do {
             let listOfFiles = try fileManager.contentsOfDirectory(atPath: folder)
             for file in listOfFiles {
-                let thisSourceFile = URL(fileURLWithPath: #file)
-                let thisDirectory = thisSourceFile.deletingLastPathComponent()
+                let thisDirectory = URL(fileURLWithPath: folder)
                 let url = thisDirectory.appendingPathComponent(file)
                 let parseTree = ParseTree(url: url)
                 if parseTree.getRoot() != nil{
@@ -52,8 +51,7 @@ open class TreeBank {
             let listOfFiles = try fileManager.contentsOfDirectory(atPath: folder)
             for file in listOfFiles {
                 if file.contains(pattern){
-                    let thisSourceFile = URL(fileURLWithPath: #file)
-                    let thisDirectory = thisSourceFile.deletingLastPathComponent()
+                    let thisDirectory = URL(fileURLWithPath: folder)
                     let url = thisDirectory.appendingPathComponent(file)
                     let parseTree = ParseTree(url: url)
                     if parseTree.getRoot() != nil{
@@ -82,6 +80,20 @@ open class TreeBank {
         return parseTrees.count
     }
 
+    /**
+     * Calls recursive function to count the number of words in the treeBank.
+    - Parameters:
+        - excludeStopWords: If true, stop words are not counted.
+    - Returns: Number of words in the treeBank.
+     */
+    public func wordCount(excludeStopWords: Bool) -> Int{
+        var count = 0
+        for tree in parseTrees{
+            count = count + tree.wordCount(excludeStopWords: excludeStopWords)
+        }
+        return count
+    }
+
     /**
      * Accessor for a single ParseTree.
     - Parameters:
diff --git a/Tests/ParseTreeTests/TreeBankTest.swift b/Tests/ParseTreeTests/TreeBankTest.swift
new file mode 100644
index 0000000..b7165d7
--- /dev/null
+++ b/Tests/ParseTreeTests/TreeBankTest.swift
@@ -0,0 +1,27 @@
+import XCTest
+@testable import ParseTree
+
+final class TreeBankTest: XCTestCase {
+
+    var treeBank1 : TreeBank = TreeBank()
+
+    override func setUp() {
+        let thisSourceFile = URL(fileURLWithPath: #file)
+        let thisDirectory = thisSourceFile.deletingLastPathComponent()
+        treeBank1 = TreeBank(folder: thisDirectory.appendingPathComponent("trees").path)
+    }
+
+    func testSize() {
+        XCTAssertEqual(5, treeBank1.size())
+    }
+
+    func testWordCount() {
+        XCTAssertEqual(30, treeBank1.wordCount(excludeStopWords: true))
+        XCTAssertEqual(52, treeBank1.wordCount(excludeStopWords: false))
+    }
+
+    static var allTests = [
+        ("testSize", testSize),
+        ("testWordCount", testWordCount),
+    ]
+}