# test-TF-IDF.py
# Forked from zelandiya/RAKE-tutorial.
#
# Stop word list from SMART (Salton, 1971).
# Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
#
# Kept as a plain ``list`` (not a set) because it is handed directly to
# ``TfidfVectorizer(stop_words=...)``.  Every entry appears exactly once and
# the original ordering is preserved.
#
stoplist = [
    "a", "a's", "able", "about", "above", "according", "accordingly",
    "across", "actually", "after", "afterwards", "again", "against", "ain't",
    "all", "allow", "allows", "almost", "alone", "along", "already", "also",
    "although", "always", "am", "among", "amongst", "an", "and", "another",
    "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
    "anywhere", "apart", "appear", "appreciate", "appropriate", "are",
    "aren't", "around", "as", "aside", "ask", "asking", "associated", "at",
    "available", "away", "awfully",
    "b", "be", "became", "because", "become", "becomes", "becoming", "been",
    "before", "beforehand", "behind", "being", "believe", "below", "beside",
    "besides", "best", "better", "between", "beyond", "both", "brief", "but",
    "by",
    "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause",
    "causes", "certain", "certainly", "changes", "clearly", "co", "com",
    "come", "comes", "concerning", "consequently", "consider", "considering",
    "contain", "containing", "contains", "corresponding", "could",
    "couldn't", "course", "currently",
    "d", "definitely", "described", "despite", "did", "didn't", "different",
    "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards",
    "during",
    "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
    "enough", "entirely", "especially", "et", "etc", "even", "ever", "every",
    "everybody", "everyone", "everything", "everywhere", "ex", "exactly",
    "example", "except",
    "f", "far", "few", "fifth", "first", "five", "followed", "following",
    "follows", "for", "former", "formerly", "forth", "four", "from",
    "further", "furthermore",
    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going",
    "gone", "got", "gotten", "greetings",
    "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
    "haven't", "having", "he", "he's", "hello", "help", "hence", "her",
    "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how",
    "howbeit", "however",
    "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate",
    "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates",
    "inner", "insofar", "instead", "into", "inward", "is", "isn't", "it",
    "it'd", "it'll", "it's", "its", "itself",
    "j", "just",
    "k", "keep", "keeps", "kept", "know", "knows", "known",
    "l", "last", "lately", "later", "latter", "latterly", "least", "less",
    "lest", "let", "let's", "like", "liked", "likely", "little", "look",
    "looking", "looks", "ltd",
    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
    "merely", "might", "more", "moreover", "most", "mostly", "much", "must",
    "my", "myself",
    "n", "name", "namely", "nd", "near", "nearly", "necessary", "need",
    "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no",
    "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing",
    "novel", "now", "nowhere",
    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on",
    "once", "one", "ones", "only", "onto", "or", "other", "others",
    "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
    "over", "overall", "own",
    "p", "particular", "particularly", "per", "perhaps", "placed", "please",
    "plus", "possible", "presumably", "probably", "provides",
    "q", "que", "quite", "qv",
    "r", "rather", "rd", "re", "really", "reasonably", "regarding",
    "regardless", "regards", "relatively", "respectively", "right",
    "s", "said", "same", "saw", "say", "saying", "says", "second",
    "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
    "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
    "seven", "several", "shall", "she", "should", "shouldn't", "since",
    "six", "so", "some", "somebody", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
    "specified", "specify", "specifying", "still", "sub", "such", "sup",
    "sure",
    "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank",
    "thanks", "thanx", "that", "that's", "thats", "the", "their", "theirs",
    "them", "themselves", "then", "thence", "there", "there's", "thereafter",
    "thereby", "therefore", "therein", "theres", "thereupon", "these",
    "they", "they'd", "they'll", "they're", "they've", "think", "third",
    "this", "thorough", "thoroughly", "those", "though", "three", "through",
    "throughout", "thru", "thus", "to", "together", "too", "took", "toward",
    "towards", "tried", "tries", "truly", "try", "trying", "twice", "two",
    "u", "un", "under", "unfortunately", "unless", "unlikely", "until",
    "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using",
    "usually", "uucp",
    "v", "value", "various", "very", "via", "viz", "vs",
    "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
    "what's", "whatever", "when", "whence", "whenever", "where", "where's",
    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
    "whether", "which", "while", "whither", "who", "who's", "whoever",
    "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
    "within", "without", "won't", "wonder", "would", "wouldn't",
    "x",
    "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
    "z", "zero",
]
# Sample document to score: two biomedical abstracts concatenated into a
# single string.  The adjacent string literals inside the parentheses are
# joined at compile time, so the resulting value contains no line breaks;
# each fragment carries its own trailing space where one is needed.
text = (
    "Background and aims. "
    "The ATP-binding cassette (ABC) transporter family transports various "
    "molecules across the enterocytes in the gut protecting the intestine "
    "against potentially harmful substances. "
    "Moreover, ABC transporters are involved in mucosal immune defence "
    "through interaction with cytokines. "
    "The study aimed to assess whether polymorphisms in ABCB1, ABCC2 and "
    "ABCG2 were associated with risk of colorectal cancer (CRC) and to "
    "investigate gene-environment (dietary factors, smoking and use of "
    "non-steroidal anti-inflammatory drugs) and gene-gene interactions "
    "between previously studied polymorphisms in IL1B and IL10 and ABC "
    "transporter genes in relation to CRC risk. "
    "Materials and methods. "
    "We used a Danish prospective case-cohort study of 1010 CRC cases and "
    "1829 randomly selected participants from the Danish Diet, Cancer and "
    "Health cohort. "
    "Incidence rate ratios were calculated based on Cox proportional "
    "hazards model. "
    "Results. "
    "None of the polymorphisms were associated with CRC, but ABCB1 and "
    "ABCG2 haplotypes were associated with risk of CRC. "
    "ABCB1/rs1045642 interacted with intake of cereals and fiber (p-Value "
    "for interaction (Pint) = 0.001 and 0.01, respectively). "
    "In a three-way analysis, both ABCB1/rs1045642 and ABCG2/rs2231137 in "
    "combination with IL10/rs3024505 interacted with fiber intake in "
    "relation to risk of CRC (Pint = 0.0007 and 0.009). "
    "Conclusions. "
    "Our results suggest that the ABC transporters "
    "P-glycoprotein/multidrug resistance 1 and BRCP, in cooperation with "
    "IL-10, are involved in the biological mechanism underlying the "
    "protective effect of fiber intake in relation to CRC. "
    "These results should be replicated in other cohorts to rule out "
    "chance findings. "
    "© 2015 Informa Healthcare. "
    "Background: The etiology of the inflammatory bowel diseases, including "
    "ulcerative colitis (UC), remains incompletely explained. "
    "We hypothesized that an analysis of the UC colon proteome could reveal "
    "novel insights into the disease etiology. "
    "Methods: Mucosal colon biopsies were taken by endoscopy from "
    "noninflamed tissue of 10 patients with UC and 10 controls. "
    "The biopsies were either snap-frozen for protein analysis or prepared "
    "for histology. "
    "The protein content of the biopsies was characterized by "
    "high-throughput gel-free quantitative proteomics, and biopsy histology "
    "was analyzed by light microscopy and confocal microscopy. "
    "Results: We identified and quantified 5711 different proteins with "
    "proteomics. "
    "The abundance of the proteins calprotectin and lactotransferrin in the "
    "tissue correlated with the degree of tissue inflammation as determined "
    "by histology. "
    "However, fecal calprotectin did not correlate. "
    "Forty-six proteins were measured with a statistically significant "
    "differences in abundances between the UC colon tissue and controls. "
    "Eleven of the proteins with increased abundances in the UC biopsies "
    "were associated with neutrophils and neutrophil extracellular traps. "
    "The findings were validated by microscopy, where an increased "
    "abundance of neutrophils and the presence of neutrophil extracellular "
    "traps by extracellular DNA present in the UC colon tissue were "
    "confirmed. "
    "Conclusions: Neutrophils, induced neutrophil extracellular traps, and "
    "several proteins that play a part in innate immunity are all increased "
    "in abundance in the morphologically normal colon mucosa from patients "
    "with UC."
)
from sklearn.feature_extraction.text import TfidfVectorizer
# Rank the terms of `text` by TF-IDF weight and print them, highest first.
#
# NOTE: the vectorizer is fitted on a corpus that contains only `text`, so
# every term has the same document frequency and therefore the same IDF
# factor; the resulting ranking reflects (normalised) term frequency only.
# Fit on a larger corpus if real inverse-document-frequency weighting is
# wanted.

# Sample corpus kept from the tutorial this script was adapted from; it is
# not used by the ranking below.
plays_corpus = ['This is Romeo and Juliet', 'this is another play', 'and another', 'and one more']
# Just the first play, wrapped in a list because vectorizers take an
# iterable of documents even when there is only one.
romeo = [plays_corpus[0]]

# Vectorizer that drops the SMART stop words before weighting.
v = TfidfVectorizer(stop_words=stoplist)

# Learn the vocabulary/IDF from the document and score it in one pass; the
# result is a 1 x n_terms sparse matrix of TF-IDF weights.
model = v.fit_transform([text])

# The scored document is exactly the matrix returned by fit_transform();
# transforming the same text a second time would only repeat the work.
doc = model

# Vocabulary terms in column order.  get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# releases that predate it.  .tolist() converts the NumPy array to plain
# Python strings so the printed tuples look the same as before.
try:
    terms = v.get_feature_names_out().tolist()
except AttributeError:
    terms = v.get_feature_names()

# Flatten the single-row matrix into a list of weights aligned with `terms`.
scores = doc.toarray().flatten().tolist()

# Pair each weight with its term and order by weight, highest first (the
# sort is stable, so equally weighted terms keep vocabulary order).
data = list(zip(scores, terms))
sorted_data = sorted(data, key=lambda pair: pair[0], reverse=True)
print(sorted_data)