diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fbd795e79..77dba39b1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.13.8-dev3 +## 0.13.8-dev9 ### Enhancements -**Faster evaluation** Support for concurrent processing of documents during evaluation +* **Faster evaluation** Support for concurrent processing of documents during evaluation ### Features @@ -10,8 +10,12 @@ * **Add missing starting_page_num param to partition_image** * **Make the filename and file params for partition_image and partition_pdf match the other partitioners** +* **Fix include_slide_notes and include_page_breaks params in partition_ppt** * **Re-apply: skip accuracy calculation feature** Overwritten by mistake -* **AstraDB: opton to prevent indexing metadata** +* **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not reflect this previously. +* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf. +* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines. 
+* **AstraDB: option to prevent indexing metadata** ## 0.13.7 diff --git a/docs/requirements.txt b/docs/requirements.txt index 51ada53f22..43b2b2232a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -48,7 +48,7 @@ markdown-it-py==3.0.0 # myst-parser markupsafe==2.1.5 # via jinja2 -mdit-py-plugins==0.4.0 +mdit-py-plugins==0.4.1 # via myst-parser mdurl==0.1.2 # via markdown-it-py diff --git a/example-docs/csv-with-long-lines.csv b/example-docs/csv-with-long-lines.csv new file mode 100644 index 0000000000..421f2533f6 --- /dev/null +++ b/example-docs/csv-with-long-lines.csv @@ -0,0 +1,11 @@ +SpearmanCorrelationCoefficient,1Freq CD161CD8TfrequencyUnstim,1Freq Ki67CD8frequencyUnstim,1Freq pDCfrequencyUnstim,1Freq GranulocytesfrequencyUnstim,2Unstim BasophilsCREBUnstim,2Unstim BasophilsSTAT5Unstim,2Unstim BasophilsS6Unstim,2Unstim BasophilsP38Unstim,2Unstim BasophilsZap70_SykUnstim,2Unstim Basophils4EBP1Unstim,2Unstim CD4TeffSTAT3Unstim,2Unstim CD4Teff4EBP1Unstim,2Unstim CD4TmemCD38Ki67Ki67Unstim,2Unstim CD4TnaiveCD38Ki67CD38Unstim,2Unstim CD4TnaiveCD38Ki67STAT1Unstim,2Unstim CD4TnaiveCD38Ki67STAT4Unstim,2Unstim CD4TnaiveCD38Ki67HLADRUnstim,2Unstim CD8TeffCD38Unstim,2Unstim CD8TeffHLADRUnstim,2Unstim CD8TmemCD38Ki67CREBUnstim,2Unstim CD8TnaiveCD38Ki67CD38Unstim,2Unstim CD8TnaiveCD38Ki67CREBUnstim,2Unstim CD8TnaiveCD38Ki67STAT5Unstim,2Unstim CD8TnaiveCD38Ki67IkBaUnstim,2Unstim CD8TnaiveCD38Ki67Ki67Unstim,2Unstim CD8TnaiveCD38Ki674EBP1Unstim,2Unstim CD8TnaiveCD38Ki67STAT6Unstim,2Unstim CD38BcellIgMUnstim,2Unstim CD38BcellHLADRUnstim,2Unstim CD45RACD14negCD123neg4EBP1Unstim,2Unstim CD56brightCD16negNKS6Unstim,2Unstim CD56dimCD16posNKPLCg2Unstim,2Unstim CD161CD8TSTAT1Unstim,2Unstim CD161CD8TTBK1Unstim,2Unstim GranulocytesCD38Unstim,2Unstim GranulocytesSTAT5Unstim,2Unstim GranulocytesErk1_2Unstim,2Unstim GranulocytesIkBaUnstim,2Unstim Granulocytes4EBP1Unstim,2Unstim GranulocytesHLADRUnstim,2Unstim IgMnBcellIgMUnstim,2Unstim IgMnBcellHLADRUnstim,2Unstim 
intMCSTAT3Unstim,2Unstim intMCCREBUnstim,2Unstim intMCP38Unstim,2Unstim intMCHLADRUnstim,2Unstim intMCSTAT6Unstim,2Unstim Ki67CD8STAT3Unstim,2Unstim Ki67CD8CREBUnstim,2Unstim Ki67CD8S6Unstim,2Unstim Ki67CD8IkBaUnstim,2Unstim Ki67CD38CD8TCD38Unstim,2Unstim Ki67CD38CD8TSTAT1Unstim,2Unstim Ki67CD38CD8TZap70_SykUnstim,2Unstim mDCBDCA3CD38Unstim,2Unstim mDCBDCA3S6Unstim,2Unstim mDCBDCA3HLADRUnstim,2Unstim MDSC4EBP1Unstim,2Unstim MDSCHLADRUnstim,2Unstim ncMCSTAT3Unstim,2Unstim ncMCSTAT5Unstim,2Unstim ncMCIkBaUnstim,2Unstim NKTKi67STAT3Unstim,2Unstim pDCCD38Unstim,2Unstim pDCSTAT3Unstim,2Unstim pDCS6Unstim,2Unstim pDCP38Unstim,2Unstim pDCHLADRUnstim,2Unstim PlasmablastSTAT6Unstim,2Unstim TregSTAT4Unstim,2Unstim TregHLADRUnstim,2Unstim BcellIgMUnstim,2Unstim mDCIkBaUnstim,2Unstim mDC4EBP1Unstim,2Unstim NKcellHLADRUnstim,2Unstim CD4TnaiveTBK1Unstim,2Unstim CD4Tnaive4EBP1Unstim,2Unstim CD8TmemIkBaUnstim,2Unstim CD8TmemHLADRUnstim,2Unstim NKTSTAT3Unstim,2Unstim NKTCREBUnstim,3LPS CD38BcellCD38LPSCI,3LPS CD38BcellHLADRLPSCI,3LPS CD45RACD14negCD123negCD38LPSCI,3LPS CD45RACD14negCD123negS6LPSCI,3LPS CD45RACD14negCD123neg4EBP1LPSCI,3LPS CD56dimCD16posNKCD38LPSCI,3LPS GranulocytesCD38LPSCI,3LPS GranulocytesTBK1LPSCI,3LPS mDCCD1c4EBP1LPSCI,3LPS mDCCD1cHLADRLPSCI,3LPS MDSCCD38LPSCI,3LPS MDSCIkBaLPSCI,3LPS ncMCCD38LPSCI,3LPS ncMCErk1_2LPSCI,3LPS ncMCIkBaLPSCI,3LPS ncMCHLADRLPSCI,3LPS NKTKi67CD38LPSCI,3LPS pDCS6LPSCI,3LPS pDCKi67LPSCI,3LPS PlasmablastZap70_SykLPSCI,3LPS PlasmablastHLADRLPSCI,3LPS TregHLADRLPSCI,3LPS DCCD38LPSCI,3LPS mDCHLADRLPSCI,3LPS NKcellZap70_SykLPSCI,4IFN BasophilsTBK1IFNIL,4IFN BasophilsIkBaIFNIL,4IFN CD4nCD8nTCD38IFNIL,4IFN CD4nCD8nTZap70_SykIFNIL,4IFN CD4nCD8nTHLADRIFNIL,4IFN CD4TmemCD38Ki67CD38IFNIL,4IFN CD4TmemCD38Ki67PLCg2IFNIL,4IFN CD4TmemCD38Ki67Ki67IFNIL,4IFN CD4TmemCD38Ki67HLADRIFNIL,4IFN CD4TnaiveCD38Ki67Zap70_SykIFNIL,4IFN CD4TnaiveCD38Ki67Ki67IFNIL,4IFN CD4TnaiveCD38Ki674EBP1IFNIL,4IFN CD4TnaiveCD38Ki67HLADRIFNIL,4IFN CD8TeffSTAT5IFNIL,4IFN 
CD8TmemCD38Ki67CD38IFNIL,4IFN CD8TmemCD38Ki67CREBIFNIL,4IFN CD8TmemCD38Ki67STAT5IFNIL,4IFN CD8TmemCD38Ki67IkBaIFNIL,4IFN CD8TmemCD38Ki674EBP1IFNIL,4IFN CD8TmemCD38Ki67HLADRIFNIL,4IFN CD8TnaiveCD38Ki67STAT3IFNIL,4IFN CD8TnaiveCD38Ki67CREBIFNIL,4IFN CD8TnaiveCD38Ki67S6IFNIL,4IFN CD8TnaiveCD38Ki67Zap70_SykIFNIL,4IFN CD8TnaiveCD38Ki67IkBaIFNIL,4IFN CD8TnaiveCD38Ki67Ki67IFNIL,4IFN CD8TnaiveCD38Ki674EBP1IFNIL,4IFN CD8TnaiveCD38Ki67HLADRIFNIL,4IFN CD56brightCD16negNKS6IFNIL,4IFN GranulocytesCD38IFNIL,4IFN GranulocytesSTAT5IFNIL,4IFN GranulocytesS6IFNIL,4IFN GranulocytesIkBaIFNIL,4IFN GranulocytesKi67IFNIL,4IFN GranulocytesHLADRIFNIL,4IFN GranulocytesSTAT6IFNIL,4IFN IgMpBcellCD38IFNIL,4IFN Ki67CD8STAT5IFNIL,4IFN Ki67CD8STAT6IFNIL,4IFN Ki67CD38CD4TKi67IFNIL,4IFN Ki67CD38CD4THLADRIFNIL,4IFN Ki67CD38CD8TCD38IFNIL,4IFN Ki67CD38CD8TSTAT3IFNIL,4IFN Ki67CD38CD8TSTAT5IFNIL,4IFN Ki67CD38CD8TIkBaIFNIL,4IFN Ki67CD38CD8TSTAT4IFNIL,4IFN Ki67CD38CD8T4EBP1IFNIL,4IFN Ki67CD38CD8TSTAT6IFNIL,4IFN mDCCD1cIkBaIFNIL,4IFN MDSC4EBP1IFNIL,4IFN MDSCHLADRIFNIL,4IFN ncMCHLADRIFNIL,4IFN ncMCSTAT6IFNIL,4IFN pDCSTAT1IFNIL,4IFN pDCS6IFNIL,4IFN pDCKi67IFNIL,4IFN pDCSTAT6IFNIL,4IFN PlasmablastCD38IFNIL,4IFN PlasmablastS6IFNIL,4IFN TregCREBIFNIL,4IFN TregHLADRIFNIL,4IFN cMC4EBP1IFNIL,4IFN DCCD38IFNIL,4IFN DCCREBIFNIL,4IFN mDCSTAT1IFNIL,4IFN TCD38IFNIL,4IFN CD4TCD38IFNIL,4IFN CD4TmemCD38IFNIL,4IFN CD8TmemHLADRIFNIL,5PI CD4nCD8nTCD38PI,5PI CD4TeffHLADRPI,5PI CD4TmemCD38Ki67HLADRPI,5PI CD8TeffZap70_SykPI,5PI CD38BcellCD38PI,5PI CD56brightCD16negNKCD38PI,5PI CD56brightCD16negNKP38PI,5PI CD56dimCD16posNKZap70_SykPI,5PI CD161CD8TMAPKAPK2PI,5PI GranulocytesSTAT3PI,5PI GranulocytesS6PI,5PI GranulocytesErk1_2PI,5PI GranulocytesP38PI,5PI GranulocytesIkBaPI,5PI Granulocytes4EBP1PI,5PI IgMpBcellHLADRPI,5PI Ki67CD4THLADRPI,5PI Ki67CD8STAT1PI,5PI Ki67CD8TBK1PI,5PI Ki67CD38CD4TKi67PI,5PI Ki67CD38CD4THLADRPI,5PI Ki67CD38CD8TCD38PI,5PI Ki67CD38CD8TSTAT3PI,5PI Ki67CD38CD8THLADRPI,5PI mDCCD1cHLADRPI,5PI NKTKi67STAT1PI,5PI 
NKTKi67HLADRPI,5PI PlasmablastHLADRPI,5PI PlateletsSTAT3PI,5PI PlateletsErk1_2PI,5PI PlateletsIkBaPI,5PI PlateletsHLADRPI,5PI TregSTAT3PI,5PI TregHLADRPI,5PI cMCHLADRPI,5PI mDCSTAT3PI,5PI mDCPLCg2PI,5PI mDCErk1_2PI,5PI mDCKi67PI,5PI mDCHLADRPI,5PI NKcellS6PI,5PI TCD38PI,5PI CD4TnaiveTBK1PI,5PI CD4Tnaive4EBP1PI,5PI CD8TnaiveIkBaPI,5PI NKTS6PI,5PI NKTIkBaPI,6Prot ACE2,6Prot ACP5,6Prot ACP6,6Prot ACTN4,6Prot ADA,6Prot ADAMTS13,6Prot ADAMTS8,6Prot ADGRG1,6Prot ALDH3A1,6Prot AMBN,6Prot ANGPTL3,6Prot ANXA5,6Prot AOC1,6Prot AZU1,6Prot BAIAP2,6Prot CALB2,6Prot CBLN4,6Prot CCL15,6Prot CCL27,6Prot CD209,6Prot CD276,6Prot CDCP1,6Prot CDHR1,6Prot CDHR2,6Prot CEACAM5,6Prot CLEC10A,6Prot CLEC4G,6Prot CLSPN,6Prot CNTN4,6Prot CNTN5,6Prot COL1A1,6Prot CPVL,6Prot CRH,6Prot CRHR1,6Prot CRLF1,6Prot CRNN,6Prot CTSB,6Prot CTSH,6Prot CX3CL1,6Prot DCBLD2,6Prot DDAH1,6Prot DEFB4A_DEFB4B,6Prot DPEP2,6Prot DSG4,6Prot ECE1,6Prot EPHA10,6Prot FABP9,6Prot FCGR2B,6Prot FCN2,6Prot FCRLB,6Prot FKBP7,6Prot FLT3LG,6Prot FLT4,6Prot FOLR1,6Prot FUCA1,6Prot FUT8,6Prot GALNT10,6Prot GALNT7,6Prot GCG,6Prot GCNT1,6Prot GFOD2,6Prot GGA1,6Prot GGH,6Prot GH2,6Prot GHRHR,6Prot GZMH,6Prot HPGDS,6Prot HSD11B1,6Prot HYAL1,6Prot IDS,6Prot IDUA,6Prot IFNG,6Prot IFNGR2,6Prot IL13,6Prot IL15,6Prot IL15RA,6Prot IL17F,6Prot IL17RA,6Prot IL19,6Prot IL1B,6Prot IL1RAP,6Prot IL22RA1,6Prot IL24,6Prot IL33,6Prot IL4,6Prot IL5,6Prot ITGB6,6Prot ITIH3,6Prot KCNIP4,6Prot KDR,6Prot KIR3DL1,6Prot L1CAM,6Prot LAMP3,6Prot LEP,6Prot LGALS1,6Prot LGALS9,6Prot LHB,6Prot LRP1,6Prot LRPAP1,6Prot LY75,6Prot LY9,6Prot MDK,6Prot MERTK,6Prot MLN,6Prot MME,6Prot MSLN,6Prot NGF,6Prot NID1,6Prot NINJ1,6Prot NTRK3,6Prot OPTC,6Prot PADI2,6Prot PDCD1,6Prot PDCD1LG2,6Prot PDGFC,6Prot PKLR,6Prot PLIN1,6Prot PON2,6Prot PRELP,6Prot PRL,6Prot PRSS27,6Prot PRSS8,6Prot PTPRF,6Prot RANGAP1,6Prot S100A16,6Prot SCGB1A1,6Prot SCLY,6Prot SCP2,6Prot SDC1,6Prot SFRP1,6Prot SFTPA2,6Prot SIGLEC9,6Prot SIRT5,6Prot SLAMF1,6Prot SLC39A5,6Prot SOST,6Prot 
SPOCK1,6Prot TDGF1,6Prot TIGAR,6Prot TNFRSF11A,6Prot TNR,6Prot TP53,6Prot TPP1,6Prot VAT1,6Prot WARS,6Prot WFDC2,6Prot XRCC4 +1Freq CD161CD8TfrequencyUnstim,1,-0.285021097,0.312877946,-0.296319737,0.215627978,0.17301315,0.194282447,0.429845626,0.326592318,0.168591576,-0.030716119,0.084662173,-0.282135586,0.033216783,0.187762238,-0.223208042,0.011101399,0.143489422,-0.182867765,0.375699301,0.074959447,0.224622215,0.084265346,0.149150517,-0.142235123,-0.001536754,0.159566294,0.059111415,0.170580998,0.362384451,0.087596017,0.052358168,0.076948855,-0.173611199,0.326684533,0.181122675,0.156868448,0.277373094,0.237016148,0.163397701,0.258715968,0.120348006,-0.178661733,0.282886216,0.196916435,-0.062843047,0.138822078,0.045865527,0.361903919,0.262787845,0.217010837,0.090460323,0.34154017,0.402783354,0.092508711,0.278222997,0.143031359,0.092437864,-0.289582232,0.106657183,0.241194879,-0.015248933,0.009724453,0.088466222,-0.039134809,0.387290409,0.052079142,0.078973843,-0.089644381,-0.10370642,-0.059244553,0.117768308,0.25767903,0.290405797,-0.307730244,0.042198308,-0.109801591,0.267492002,-0.477234647,0.054464523,0.40580938,0.022146275,-0.051332878,0.137984148,0.271701982,-0.174231032,0.143882433,-0.248773903,0.077170198,0.020851371,0.005699856,-0.047498238,-0.027202255,0.268562111,0.242218997,0.127560071,-0.21053056,0.176760082,-0.002049005,-0.016818919,0.080451128,0.269924812,0.011825017,-0.024709923,-0.383321941,0.097711854,-0.219499341,-0.279841897,-0.192423721,-0.241989904,-0.134051882,-0.121958349,-0.135732554,0.320168067,-0.049141396,-0.432157249,0.06432065,0.01553416,-0.124550125,0.284765431,-0.193939394,-0.278787879,0.324747475,-0.00966811,0.022222222,0.353174603,0.184831051,-0.018556033,0.088566567,-0.212705615,0.020014586,-0.123085649,-0.012235637,0.156470302,0.328963574,-0.299190754,0.15951772,-0.071830471,-0.070917062,-0.096930946,-0.059810011,0.250968213,-0.327543236,0.261052247,0.214724151,0.305334308,-0.136755572,-0.137201692,0.220009578,0.295873573,-0.03647
5377,0.374850347,0.076941496,0.308324687,-0.348790805,0.205047573,0.349459765,-0.224033286,0.184563118,0.009695047,-0.033869709,-0.261514442,-0.011004508,-0.244752175,-0.033202969,-0.302046036,-0.348666423,-0.017647059,-0.253456449,-0.185202777,0.040811107,-0.36815057,-0.28038892,-0.098536177,-0.096382901,0.391078225,-0.010288936,-0.131836966,-0.033121917,-0.097533474,0.151011779,0.050135911,0.095620113,0.160676533,-0.019591261,0.036081748,-0.057223397,-0.033967583,-0.016208598,-0.131642001,0.006624383,0.018604651,-0.077801268,-0.044256519,0.055673009,-0.194221283,0.08989547,0.189721254,0.033623693,0.189692894,-0.016490486,0.254686399,0.309059233,0.033262861,0.07230444,-0.140803383,-0.053136011,0.114164905,0.006624383,0.185623679,0.120084567,0.12769556,0.080902044,-0.259901339,-0.232135307,0.221987315,0.113248002,-0.162649753,-0.164059197,-0.228188865,0.310782241,-0.125863284,-0.398921707,-0.42721519,-0.212072199,-0.25676428,-0.254078762,0.216080638,-0.078024367,-0.224660103,-0.128364559,0.305133615,-0.216549461,-0.429864041,-0.071659634,-0.343084857,-0.263595874,-0.332817628,-0.096530708,-0.218039252,-0.308251289,-0.184857009,-0.163994374,-0.363877168,0.030661041,-0.333169245,-0.245981282,-0.061978434,-0.283053698,-0.279947726,0.115166432,0.209329583,-0.032770745,-0.196952649,0.273699015,0.065283638,-0.113197375,0.088818565,-0.146155649,-0.429254571,-0.39172527,-0.302320675,-0.380543835,-0.185185185,-0.249296765,0.309188936,-0.197328864,-0.190214429,-0.007032349,-0.097304266,-0.279254571,-0.366315049,-0.213548992,-0.018682607,-0.329910924,-0.322362869,-0.254946085,-0.153211439,-0.369259079,-0.111650912,-0.145874355,-0.323935325,0.054758556,-0.058954524,-0.356633849,0.231879981,0.019104548,-0.20100797,0.140154712,-0.083052039,-0.285818097,-0.259827938,-0.120651664,-0.290765886,-0.33588842,-0.047491796,-0.430707923,-0.306469761,-0.050820441,-0.107712143,-0.110384435,-0.304969526,0.108907642,-0.059399906,-0.258931083,-0.380956288,0.162634787,0.248499766,-0.227449602,-
0.29798406,-0.043834974,0.018495077,-0.050797,0.055625879,-0.490881388,-0.366268167,-0.390271917,-0.543369335,-0.123956868,-0.293506798,-0.360806376,0.053750586,-0.193459916,-0.45140647,-0.467314037,-0.187810595,-0.291608064,-0.399132677,-0.308358582,-0.395593061,-0.237132191,0.211157993,-0.045616503,-0.236099391,-0.28870136,-0.337412096,-0.319081106,-0.113173933,-0.253070792,-0.253656821,-0.167909048,-0.324941397,-0.057853807,-0.436099391,-0.44092827,0.150961088,-0.32620722,-0.171801288,-0.310337553,-0.062423816,-0.486615096,-0.498218472,-0.260103141,-0.310548523,-0.142429339,-0.298101266,-0.212353493,-0.170979841,-0.488888889,-0.078527895,-0.231387717,-0.53185654,-0.138654477,-0.176793249,-0.346507267,-0.291912799,-0.452531646,-0.462001875,-0.261720581 +1Freq Ki67CD8frequencyUnstim,-0.285021097,1,-0.326082932,0.117750678,-0.076882028,-0.292243187,-0.099447303,-0.298646846,-0.497615425,-0.258395273,-0.045258539,-0.137116049,0.138987595,0.174475524,-0.175174825,0.01097028,0.280638112,0.229378217,0.086938378,-0.223164336,0.001195253,-0.345855033,-0.341330146,-0.33911039,0.290873389,-0.419704602,-0.371723726,-0.243937116,-0.199781271,-0.097930867,-0.258463727,-0.27548767,-0.353686527,-0.108461159,0.137730826,-0.089946446,-0.120399221,-0.217553258,-0.053383642,-0.071498996,-0.115189873,-0.150316456,-0.063459759,-0.241813136,-0.195066297,0.279432624,-0.190379278,-0.174001947,-0.222906524,-0.166966894,-0.075219085,0.003794891,-0.369770455,-0.435122047,0.308710801,-0.190243902,0.122473868,-0.159527587,0.203747073,-0.12745377,-0.228961593,-0.256216216,-0.091893866,0.063910959,-0.206036217,0.072266935,-0.349698189,0.032226693,-0.162048364,0.034980526,0.057132425,-0.14751704,-0.159871956,-0.238753651,0.03882668,-0.063023369,-0.068743914,-0.186295034,0.049854247,-0.06587147,-0.327458617,0.110799727,-0.247983595,0.165963302,0.188038278,-0.0784689,0.12809296,0.279704028,-0.148803828,0.170707071,0.050721501,0.236786469,0.144890768,0.2823398,0.085470085,0.008466376,-0.101193356,0
.095557075,-0.138393238,-0.035345343,-0.297402597,0.06937799,-0.248462064,0.176763103,0.036910458,0.011791042,0.020948617,-0.012121212,0.079188244,0.190673121,-0.316368286,0.123931312,-0.388819876,-0.033284618,-0.156046767,0.025439109,-0.072072349,0.008151589,-0.117782768,-0.158529603,0.184776335,0.169480519,0.026479076,-0.199206349,0.113131313,-0.112265512,0.007535856,0.064095292,0.099586743,0.035248359,-0.006563488,-0.034762175,0.117737623,-0.144315696,-0.23970358,0.224232977,-0.178407015,0.069784436,0.072524662,0.015052978,-0.13737669,-0.274899525,0.063141989,-0.066898064,-0.09810011,-0.045780051,-0.24238217,0.259438104,-0.03396121,-0.075744273,-0.212068002,-0.313353021,0.043020193,-0.241000878,0.035916673,8.06E-05,-0.204644412,0.019620567,-0.021261977,0.116869381,-0.133237642,0.024552391,0.004759387,0.153324288,-0.045893527,0.026123493,-0.156046767,-0.143660943,0.175044047,0.030398246,-0.290464012,0.114912131,0.165144674,0.038637963,-0.144208988,0.134963338,-0.145313601,0.044809983,-0.041296688,0.076250881,0.125037753,-0.062971912,0.003889632,-0.189711064,-0.388724454,-0.128118393,-0.235658915,-0.350951374,0.065539112,-0.41465821,-0.264129669,-0.03030303,-0.31205074,0.124735729,0.217758985,-0.041014799,0.333623693,-0.274216028,0.18902439,-0.100559112,-0.174066244,-0.08893587,0.035714286,-0.151374207,-0.07751938,0.131078224,0.275828048,-0.394221283,-0.134460888,0.072727273,-0.222410148,-0.013812544,-0.021282593,-0.084143763,0.337420719,-0.126990839,0.345518037,0.052854123,-0.316701903,0.13615222,-0.244679352,0.094009866,0.266418248,0.209101174,0.17195122,0.213065792,0.345460705,0.033017164,0.067536515,-0.028726287,0.146613294,-0.166937669,0.059665763,0.210840108,0.049616079,0.181436314,0.197877145,0.224480578,-0.102303523,0.241916452,0.134823848,0.091779584,0.186924119,0.222560976,-0.087488708,0.085004517,0.272370553,0.024616079,0.111461786,0.253558342,-0.060365854,-0.196906052,0.047922313,0.132746161,-0.029132791,0.073915989,0.165763324,0.006278229,0.210817525,0
.092276423,0.374887082,0.308536585,0.29098916,0.097244806,0.074480578,-0.217457091,0.305726658,-0.047866124,-0.112240289,0.179358627,0.072764228,0.204065041,0.329561879,0.213753388,0.159146341,0.366644083,-0.069534779,0.256504065,0.358335356,0.108966288,0.299751581,0.248172133,0.087330623,0.190311653,0.169715447,-0.028229449,-0.017479675,0.017344173,-0.119173442,0.058107498,0.07366757,0.288285643,0.036630533,0.219050469,0.098893406,0.204968383,0.313233966,0.294308943,0.033288166,0.251806685,0.046589883,0.271657633,-0.022109304,0.124345077,0.161246612,0.113731446,-0.020121951,-0.022086721,0.284439928,0.271047877,-0.049277326,-0.104832882,0.008468835,-0.063075881,0.388211382,0.180555556,0.332339657,0.226751506,0.298396567,0.380894309,0.387895212,0.147922313,0.244873532,0.383242999,0.268068361,0.220799458,0.118495935,0.183152665,0.147042384,0.350045167,0.324335616,-0.163121048,0.120415537,0.208152665,0.34200542,0.145889792,0.209981933,0.071183379,0.238030714,0.278635953,0.206775068,0.098870822,0.124855322,0.270934959,0.187646793,0.113098464,0.257791328,0.205601883,0.200790425,0.214905149,0.408175248,0.259778681,0.304652213,0.137240289,0.239477413,0.120822042,0.277551942,0.133852755,0.14498645,-0.053748871,0.266147245,0.323712737,0.284191509,0.299277326,0.339476061,0.260998193,0.371386631,0.308288166,0.309439928 +1Freq 
pDCfrequencyUnstim,0.312877946,-0.326082932,1,0.037054177,-0.132941176,-0.049321267,-0.098280543,0.103529412,0.071006475,0.176289593,0.0898055,0.265861838,0.187726358,-0.216973886,0.216781874,-0.169546851,-0.015793011,-0.184875922,0.121294433,0.270174511,0.030316742,0.371312217,0.346968326,0.333755656,-0.157104072,0.290226244,0.295837104,0.16898055,0.063078471,0.221395037,0.013445378,0.062619193,0.181943175,0.09704896,0.01863437,0.053890007,0.122904091,0.27123646,0.3120389,0.28365429,0.198759222,0.150100604,-0.023819591,0.1863284,0.182804792,-0.062156448,0.126568006,0.089738431,0.338732394,0.149329309,0.215291751,-0.104744818,0.389701111,0.275706734,-0.223827392,0.07260788,-0.163414634,0.206971975,-0.233082707,0.242096055,0.249514478,-0.033295425,0.019114688,-0.411418389,-0.078571429,-0.146881288,0.12082495,-0.322199866,0.191076896,0.078906774,-0.085177733,0.149061033,0.126324614,0.046143528,-0.114755198,0.132930919,0.193628437,0.257813548,-0.224884514,0.069181757,0.320892019,-0.152717304,0.209482342,-0.024433652,0.056119981,0.072327044,-0.0890179,-0.022779043,0.296161909,0.054023545,-0.034833091,-0.143554007,0.120034843,-0.061913783,0.35270108,0.056230492,0.091860744,-0.069585551,0.227951848,0.084948348,0.232946299,-0.014271892,0.319787131,-0.349749027,-0.032978552,0.211865261,-0.115826034,0.070220477,-0.14857258,-0.197379313,0.051144689,-0.13003663,-0.015659341,0.073443223,0.060805861,-0.26886181,0.052955665,-0.185377236,-0.22485092,0.274542125,-0.046956373,-0.003500384,0.35430718,0.03987023,-0.032698711,-0.033893964,0.330200146,0.002511952,0.18013127,-0.125030387,-0.034762175,-0.069929503,0.003160198,0.081273803,0.327092934,-0.218294567,0.130311355,0.077289377,0.03264652,-0.116437729,-0.085805861,0.346749084,-0.221492889,0.298397436,0.229441392,0.18992674,-0.033928571,-0.141993088,0.113623272,0.348982335,0.045362903,0.415370584,0.063556068,0.305059524,-0.075045788,0.119795918,0.268265306,-0.056822344,0.233287546,-0.082219033,-0.037445544,-0.271738309,-0.230792476
,-0.302515361,0.170506912,-0.181135531,-0.031501832,-0.038278388,-0.354869997,-0.261355311,0.054532967,-0.16167798,-0.020184003,-0.006253659,-0.152106227,0.188235595,-0.049509764,-0.125140713,-0.052102747,-0.121465035,0.10380034,0.163763066,-0.152446175,0.199254517,-0.023741998,-0.000729276,0.083704724,0.170083462,-0.200388947,0.24284904,-0.071225995,-0.128757799,0.034275991,-0.067660643,-0.018393971,-0.040920509,0.198785425,0.061538462,-0.326315789,-0.181010453,0.11401021,-0.115792886,0.178137652,0.082246171,0.155497934,0.243497285,0.055668098,0.058747265,0.170083462,-0.288226238,-0.067822705,0.18240013,0.124544202,-0.02050077,-0.087432137,0.066040029,0.043690587,0.07641196,-0.074791346,0.120654728,0.004456689,-0.120330605,-0.31358756,-0.442243613,-0.42049241,-0.331762401,-0.44921634,0.068554856,0.038797658,-0.2488893,-0.090723272,0.208811551,-0.072195483,-0.16191534,0.05923732,-0.362797729,-0.326113785,-0.182432432,-0.180087622,-0.365176209,-0.266475379,0.109990127,-0.179563125,-0.360422066,0.064420585,-0.207639146,-0.145795892,0.079415031,-0.410854088,-0.175245089,0.325249907,0.352246082,-0.010644206,-0.161452548,0.224669875,0.113106257,-0.230377638,0.114000987,0.135289399,-0.468746143,-0.387387387,-0.274065161,-0.412532395,-0.16870295,-0.443385166,0.302388004,-0.222511898,-0.191010976,0.009564359,-0.166203875,-0.236116253,-0.492595335,-0.259533506,-0.070621992,-0.494569912,-0.360823152,-0.331266198,-0.346908552,-0.515276096,-0.278664373,-0.263050722,-0.343656236,-0.109681599,-0.163396273,-0.455417747,0.094255214,0.139361965,-0.426292731,0.197365173,-0.179717389,-0.20045045,-0.38499275,-0.274311983,-0.295631248,-0.361656177,0.064173763,-0.331327903,-0.322812539,-0.140441812,-0.165062323,-0.152073306,-0.411884487,0.158830063,-0.212853264,-0.248642478,-0.335325923,0.125293101,-0.106472911,-0.105146242,-0.264994447,-0.056614834,-0.409138591,0.009811181,0.233617179,-0.532210292,-0.315222757,-0.449401456,-0.502595509,-0.30099963,-0.213161792,-0.430797236,-0.196840676,
-0.510891028,-0.537609527,-0.437187131,-0.232660743,-0.280760212,-0.38963964,-0.234128055,-0.501789461,-0.341110863,0.287270147,-0.077748982,-0.259996298,-0.448691843,-0.40133284,-0.469702579,-0.083950389,-0.098420338,-0.390596076,-0.173022337,-0.304516846,-0.154034833,-0.336017524,-0.408274713,-0.189065778,-0.181537702,-0.102647168,-0.347834136,-0.237813156,-0.54587807,-0.523448106,-0.286961619,-0.389948167,-0.152691539,-0.252992719,-0.138343823,-0.155405405,-0.288257436,-0.099654449,-0.405220289,-0.435332593,-0.174194743,-0.299950636,-0.472325065,-0.287671233,-0.508947303,-0.375971862,-0.421016907 +1Freq GranulocytesfrequencyUnstim,-0.296319737,0.117750678,0.037054177,1,-0.27890223,-0.020011435,-0.262435678,-0.408576329,-0.340329114,-0.107985516,0.305504622,0.16255896,0.208007183,-0.126835664,0.105944056,0.368138112,0.336363636,-0.035091491,0.300193477,-0.173208042,0.118415436,-0.112183044,-0.041065483,-0.116878682,0.319559464,-0.037650474,0.115939554,-0.097744361,-0.172467532,-0.007035054,-0.374736842,-0.30756086,0.207110656,0.201709683,-0.287327807,0.380598832,0.549391431,-0.431746808,-0.041942551,-0.064086179,-0.173953262,-0.08395813,0.234289238,-0.360962072,-0.260807894,0.226888683,-0.12599445,0.229771178,-0.294109056,-0.403164557,-0.278383642,-0.068092065,0.109311366,-0.099731541,0.071428571,-0.218292683,-0.263937282,0.048374506,0.221223339,0.113001422,-0.38398293,-0.175049787,0.288777994,0.021393353,0.383467471,-0.187894031,-0.070053655,0.101576123,0.413997155,-0.091382668,0.480111977,-0.052312561,-0.033179365,-0.090311587,0.519912366,0.2719815,0.38909445,-0.081864654,0.412648568,0.221713729,-0.242210321,-0.481339713,0.09924812,-0.107154944,-0.079289132,-0.140396446,-0.489063568,-0.018865668,-0.118318524,-0.173448773,-0.274603175,-0.003100775,-0.027906977,-0.137530122,0.116110305,0.088372843,0.055313659,-0.435680109,-0.208144796,0.326304107,-0.177306904,-0.024128503,-0.161107314,-0.164049283,0.086466165,-0.240092961,0.031488801,-0.034782609,-0.070786678,-0.0
07205564,-0.220094995,0.20189989,-0.047570332,-0.219802704,-0.190464012,-0.044203144,-0.051462672,-0.183426128,-0.079208834,-0.34175669,0.092279942,0.123953824,-0.224747475,-0.212626263,-0.107503608,-0.165223665,-0.212543554,-0.288226238,0.020824893,-0.367312211,0.040434325,-0.090997488,-0.217567458,-0.150798152,-0.065817764,0.000512282,-0.319729631,0.122981366,-0.120935331,0.093423456,-0.235732554,-0.198684691,-0.015995473,-0.286189258,-0.312056997,-0.185933504,-0.169419072,0.158831511,-0.185250219,-0.182017719,-0.087636683,-0.061537234,-0.109226594,-0.253252454,0.146460212,-0.290920819,-0.402838252,0.016872161,-0.144978433,-0.076276095,0.123164866,-0.046259223,-0.097580016,0.10204326,0.163380956,-0.113737669,0.044976251,-0.07194008,0.041350719,-0.087248813,0.199013518,0.061150856,0.081181601,0.066431591,-0.262842528,0.103332638,-0.279633545,-0.146908678,-0.409302326,-0.117829457,0.042887345,-0.344306856,-0.250314007,-0.399718111,-0.43551797,-0.359548978,-0.466525722,-0.335588443,0.069203665,-0.141649049,-0.402959831,-0.194080338,-0.412403101,-0.328118393,-0.190979563,-0.027202255,-0.025261324,-0.261672474,0.110452962,-0.031845069,-0.075264271,-0.1602537,-0.038501742,-0.41987315,-0.261028894,-0.251585624,-0.06483439,-0.436504581,0.277237491,0.107399577,-0.457787174,-0.353911205,-0.360958421,-0.406624383,0.167019027,-0.313742072,0.292319961,-0.294573643,-0.321212121,0.183509514,-0.422269204,0.084143763,0.349728997,0.098396567,0.262240289,0.243000469,0.299209575,-0.294738031,-0.055341324,0.328139115,0.013188873,-0.054539295,0.091824752,0.48934056,0.093812105,0.414927733,0.235162602,0.412759711,-0.248261066,0.294942948,0.28134598,0.077484192,0.020189702,0.483062331,-0.007407407,0.1967028,0.407748463,-0.049345077,0.06181155,0.455151621,-0.187240289,-0.288888889,-0.029494128,0.168518519,-0.32068654,0.124480578,0.16104336,-0.094963866,0.152122855,0.327551942,0.216892502,0.222831978,0.476738934,0.161088528,0.440808491,-0.424887082,0.151988211,-0.021736798,-0.048961156,0.1
14611563,0.198622403,0.273825655,0.371544715,0.088053297,0.193563686,0.253161698,0.037285456,0.116892502,0.267492477,-0.085580881,0.111901536,0.286361147,-0.098735321,0.348419151,0.195189702,-0.075112918,0.15070009,0.268224932,-0.388911472,-0.060749774,0.177416441,0.093339958,0.143450768,0.438450985,0.242795845,0.051671183,0.316327913,0.267886179,0.053161698,0.232655827,0.047402891,0.383897922,-0.078952123,0.288527552,0.251806685,0.365020128,0.066034327,-0.07265131,0.062962963,0.514250226,-0.029200542,-0.110749774,-0.018179765,-0.028026197,0.419715447,0.175722674,0.481233062,0.461746058,0.188346883,0.240831075,0.425835592,-0.175248419,0.126490515,0.409101174,0.372901834,0.269241192,0.172538392,0.361653117,0.136032837,0.34397019,0.265358318,-0.138075881,-0.016282746,0.461969286,0.128974706,0.227190605,0.366418248,0.055103884,0.361495032,0.309236676,0.15203252,0.141169828,-0.068440635,0.239182475,0.190605239,0.081052394,-0.057746161,0.175610748,0.339679313,0.255420054,0.295167118,0.395799458,0.420189702,0.341237579,0.284328615,0.309214092,0.223690154,0.313482385,0.255984643,0.150564589,0.436224029,0.359914182,0.090198735,0.303884372,0.323554652,0.393744354,0.426693767,0.4217028,0.405691057 +2Unstim 
BasophilsCREBUnstim,0.215627978,-0.076882028,-0.132941176,-0.27890223,1,0.494110921,0.457137412,0.304898037,0.417636748,0.051648561,-0.214331999,0.014979989,-0.172327044,0.2386679,0.457909343,0.351295097,0.058163737,0.21113017,-0.151591386,0.49433395,-0.022128852,0.559383754,0.483193277,0.388795518,-0.379831933,0.262464986,0.418487395,0.088856636,0.46186099,0.249704593,0.270821422,0.297350867,0.481160304,0.321821994,0.176876794,0.355212502,-0.059805603,0.621926583,0.044330093,-0.148694492,0.372670097,0.327768249,-0.22456446,0.778571429,0.422648084,-0.007491289,0.62630662,-0.103792643,0.535658472,0.379988565,0.495635601,0.258103532,0.421786809,0.470825446,0.256451613,0.145967742,0.404032258,-0.316337286,-0.141238472,-0.043541364,0.650540235,0.521528786,0.001867734,0.222414617,-0.171040724,0.17800905,0.441176471,0.272760181,0.189646831,0.327310844,-0.04936154,0.387383267,0.212197446,-0.055993901,-0.096779112,0.465751858,0.100209644,0.395082905,0.104783686,-0.158604917,0.572936916,0.36535726,-0.430134588,0.144897395,0.005142795,-0.366232629,0.508042455,0.418777699,-0.531239742,-0.090710143,-0.058102637,0.302785924,0.086143695,0.154564803,0.171025276,-0.569099464,-0.120253857,0.214356056,0.141312741,-0.2996139,-0.106904475,-0.121566911,-0.212605318,0.324232643,0.014990699,0.242805559,-0.479051383,-0.353096179,-0.439501885,-0.603517537,-0.525727312,-0.112353452,-0.167281806,0.141663048,-0.128962223,-0.458876914,0.026497042,-0.021473138,-0.096831699,0.170755536,-0.309109312,-0.351214575,0.353441296,-0.02854251,-0.191497976,-0.044736842,-0.261904762,-0.044566545,0.200854701,-0.127594628,-0.117826618,0.211233211,-0.01037851,-0.025641026,-0.009878419,-0.098226214,0.294398611,-0.281914894,-0.465371255,-0.125488493,-0.378093791,0.128310899,-0.083143012,0.235453756,0.208098133,0.127116804,-0.196157186,-0.187907078,-0.330655667,0.273447677,-0.317303517,0.113004776,-0.29038211,0.098784195,-0.460811984,-0.002814259,-0.051782364,-0.459075119,0.302214503,0.387351779,0.007641634,-0.3
35573123,0.240316206,-0.437472861,-0.244789405,-0.239687364,-0.177377334,-0.276921407,-0.074095017,-0.268345636,-0.068280504,-0.156445001,-0.135728761,0.127487746,-0.302865827,0.08594416,0.01953602,-0.021538462,-0.065323565,0.120879121,0.59035409,0.010989011,0.064615385,0.123931624,-0.061660562,-0.062271062,0.152625153,0.110500611,-0.365689866,-0.115995116,-0.123321123,-0.332722833,-0.068986569,-0.520757021,-0.144078144,-0.285714286,0.099487179,-0.217777778,0.02017094,-0.12957265,-0.094017094,-0.349206349,-0.103478261,-0.116605617,0.091575092,-0.431013431,-0.242979243,-0.016483516,0.047008547,0.053724054,-0.215506716,0.137362637,-0.092796093,0.074481074,-0.166666667,-0.150793651,0.218580174,-0.028083028,-0.308302808,-0.26007326,-0.063492063,-0.155067155,-0.24833238,-0.017876882,0.044711264,-0.027292306,-0.055231561,0.06163522,-0.126281685,-0.026033924,0.114963121,-0.013074138,0.002248904,-0.262740614,0.059653135,0.062854965,-0.204497808,-0.156165428,-0.023518201,-0.384226877,0.129788451,-0.209376787,-0.072994092,-0.083056985,-0.076958262,-0.20434534,-0.356553393,0.048827902,-0.086755989,-0.203354298,-0.06918239,-0.058662093,0.062245092,-0.19245283,0.076119687,-0.11637126,-0.063312369,0.084200496,-0.243682104,-0.115227749,-0.159214789,-0.189022298,-0.154183343,0.039908519,-0.199923766,0.109281494,-0.114277002,0.263851036,0.056070135,0.217228893,-0.046540881,-0.152506194,0.030226796,0.034343434,0.094187155,-0.106537069,0.06445588,0.125824281,-0.027101717,0.049362481,0.070478369,-0.110120069,0.082294645,-0.064379646,0.093043644,0.071316943,0.015437393,0.106613303,0.088850772,-0.290642272,-0.042271774,-0.259467495,0.102801601,-0.126360327,-0.209910425,-0.101200686,-0.249170955,-0.093882218,0.0739089,-0.165237278,-0.001943968,-0.012235563,0.121174004,0.054011816,-0.019401563,-0.312108102,0.036249285,0.297045931,-0.232170764,-0.072841624,0.191385554,0.22302268,-0.062778731,-0.462473795,-0.121021536,0.153649705,-0.043415285,-0.062779927,0.136878216,0.006365542,-0.003468649
,-0.09731275,0.035410711,-0.159138555,-0.249861823,0.092052601,-0.268763103,-0.121402706,-0.05370688,-0.067047837,-0.118124643,-0.055917667,-0.135582237,-0.049818944,-0.098989899,-0.082523347,-0.047379455,-0.000266819,-0.128035068,0.107299409,-0.152734896,-0.020087669,-0.095029351,-0.034343434,0.060034305,-0.058128454,-0.059958071,-0.044787498,-0.09220507,0.014675052,-0.116599962,-0.14129979,-0.003392415,0.087402325,-0.094951305,0.107909281,-0.0378502,-0.039832285,-0.419020393,-0.49731275,-0.00933867,-0.212502382,0.178120831,0.087173623,0.01726701,-0.135048599,-0.057366114,-0.23095102,-0.044939966 +2Unstim BasophilsSTAT5Unstim,0.17301315,-0.292243187,-0.049321267,-0.020011435,0.494110921,1,0.452639604,0.453554412,0.478002333,0.590165809,0.085953878,0.417800648,-0.05767105,-0.121415356,0.437442183,0.633209991,-0.133556892,0.011473223,-0.000495521,0.365633673,-0.320448179,0.589355742,0.531652661,0.582633053,-0.34929972,0.482633053,0.607563025,0.002418965,0.086357039,0.468267581,0.216923957,0.569658853,0.515313804,0.579645512,-0.255446118,0.58452449,0.157690109,0.431174476,0.328301887,-0.123765962,0.249857061,0.14488279,-0.036236934,0.48815331,0.508710801,0.090766551,0.623867596,0.195578426,0.4810749,0.273260911,0.417571946,-0.112239961,0.399209805,0.385701044,-0.007258065,0.415725806,0.459677419,-0.13715415,-0.019104084,0.325270118,0.596839219,0.569424286,0.272955975,-0.198821063,0.147239819,-0.05438914,0.708144796,0.207692308,0.579342042,0.381360778,0.070859539,0.196569468,0.553344768,0.264875167,0.344920907,0.657861635,0.61113017,0.681646655,0.182008767,0.126891557,0.58376215,0.466243572,-0.073859284,0.307651062,0.169712222,-0.383521173,0.458146406,0.132516277,-0.182186235,-0.056351898,-0.107779845,0.190249267,-0.060117302,0.073358571,0.111062479,-0.643943539,-0.0791115,0.505197505,0.093951094,-0.223423423,0.103184156,0.100995733,-0.027683554,0.319636702,-0.252434621,0.339533866,-0.267457181,-0.180500659,-0.237811654,-0.390077084,-0.264871906,0.077507599,-0.06372123
3,0.088580113,-0.16337386,-0.182886314,0.046916781,-0.165221619,0.052750993,0.262158055,-0.220647773,-0.22145749,0.184615385,-0.063562753,-0.523279352,-0.118825911,-0.106837607,-0.080586081,-0.027472527,0.008547009,-0.105006105,0.135531136,-0.037240537,-0.068986569,0.005970473,-0.135604332,0.229483283,-0.328267477,-0.230568823,-0.291901867,-0.25054277,0.305579679,-0.198294132,0.306990881,0.21819366,0.042336083,-0.200607903,-0.12592271,-0.171841077,0.278549718,-0.266283109,0.247611811,-0.367998263,0.265740339,-0.267151541,-0.087617261,-0.133020638,-0.158271819,0.302757273,0.127799736,-0.146772069,-0.44743083,0.095783926,-0.192683456,0.175531915,-0.168150239,-0.020625271,-0.393182805,-0.038928508,-0.261940947,0.165653495,0.093587634,-0.061105221,0.010432423,-0.095636127,0.262585151,0.052503053,0.007863248,0.01037851,0.316239316,0.527472527,0.084249084,0.002393162,0.344322344,0.198412698,0.133699634,0.31990232,0.184371184,-0.399267399,0.082417582,0.125763126,-0.071428571,0.203907204,-0.222222222,-0.299145299,-0.098290598,0.151452991,0.054358974,0.007863248,-0.075555556,0.057387057,-0.161782662,0.056521739,0.036019536,0.308913309,-0.207570208,-0.296092796,0.158730159,-0.032356532,0.014652015,0.010989011,0.386446886,0.142857143,-0.042124542,-0.382783883,0.054945055,0.103255665,-0.033577534,-0.054945055,-0.351037851,0.185592186,-0.250915751,-0.148618258,-0.018715456,-0.046388412,-0.022908765,-0.294682676,-0.212426148,-0.141604727,0.035258243,-0.230154949,-0.206022489,0.146255003,-0.06559939,0.039374881,-0.041585668,0.026033924,-0.024433009,-0.273413379,-0.137605062,0.063998475,-0.027406137,-0.241928721,-0.020392605,-0.175605108,-0.010024776,-0.164401837,-0.104326282,-0.03297185,-0.139470173,-0.368553459,-0.254278635,-0.042805413,-0.214865638,-0.065828092,0.099371069,-0.036630455,-0.201677149,-0.15242996,0.047150753,-0.191842958,-0.321593291,-0.113245664,-0.205260149,-0.143434343,-0.203430532,-0.288360746,0.321751892,-0.107833047,-0.010634648,0.071012007,-0.029388222,-0.07
2307986,-0.098761197,0.103106537,-0.078559177,0.013074138,-0.156622832,-0.159522766,-0.288436982,0.126586621,-0.146712407,0.051038689,-0.101200686,0.008271393,0.05294454,-0.066361731,0.208157042,-0.062854965,-0.159138555,-0.148770726,-0.359031047,0.032285115,-0.160704416,-0.345683248,-0.182466171,-0.264036592,-0.170040023,0.041356966,-0.184295788,-0.106003431,-0.181398895,0.0450162,0.07787307,-0.050581285,-0.048485773,-0.21982085,0.110653707,-0.34232895,0.016809605,0.028473413,0.10089575,-0.063007433,-0.472536688,-0.094949495,0.027253669,0.009872308,0.04311117,-0.062473795,-0.197026872,-0.029083286,-0.170954831,-0.10920526,-0.097541452,-0.216508796,0.184295788,-0.106003431,0.116981132,0.214865638,-0.053173242,-0.169277682,-0.068267581,-0.051038689,-0.223861254,-0.176748618,-0.209071851,-0.161654279,-0.231637126,-0.077415666,0.036554221,-0.011473223,0.016123499,-0.350956774,0.021459882,-0.006670478,-0.027787307,-0.068724986,-0.009109968,-0.117591004,-0.109281494,-0.106689537,-0.142595769,-0.093043644,-0.085115304,-0.141988603,0.189479703,-0.074290071,0.064303411,-0.269296741,-0.255727082,-0.027711073,-0.196340766,-0.014293882,0.035715647,0.008652563,-0.186735277,0.007661521,0.023060797,-0.093196112 +2Unstim 
BasophilsS6Unstim,0.194282447,-0.099447303,-0.098280543,-0.262435678,0.457137412,0.452639604,1,0.544044216,0.41946832,0.3260911,0.114846579,0.214255765,-0.12391843,0.109967623,0.182354302,0.122109158,-0.030758557,0.091442729,0.050657519,0.527983349,-0.152661064,0.342577031,0.083193277,0.014565826,-0.114285714,0.020728291,0.072829132,-0.018384132,0.014916949,0.175223937,0.49693158,0.486335049,0.382016048,0.205183915,-0.019508188,0.387459501,0.214636935,0.209164031,0.015056223,0.030836669,0.170954831,-0.002401372,0.201916376,0.578397213,0.548780488,-0.020209059,0.417073171,0.251229274,0.515608919,0.627063084,0.321364589,-0.04039671,0.289388808,0.15286907,-0.004032258,0.2125,0.020967742,-0.082081686,-0.053359684,0.238509918,0.48056765,0.417674569,0.174766533,-0.008711466,0.197918552,-0.047873303,0.394208145,-0.103076923,0.166747299,0.346750524,-0.10165809,0.098913665,0.14129979,-0.109052792,-0.093805984,0.295063846,0.163407662,0.365504098,-0.116523728,0.159900896,0.563331427,0.02352555,0.014771857,0.212228052,-0.114126272,-0.321807638,0.42094321,0.187120425,-0.394025605,-0.407593829,0.111062479,0.348240469,-0.030058651,0.174127089,-0.022650181,-0.662545136,-0.137761243,0.327278696,0.171171171,-0.308365508,-0.074734654,0.096618886,0.068825911,0.237675768,-0.148046832,0.189845716,-0.434782609,-0.201185771,-0.174562305,-0.221691457,-0.200499349,-0.032240556,-0.08662614,0.155775076,0.09411637,0.003160198,-0.074467223,0.002025768,0.013370067,0.294181502,-0.02145749,-0.370040486,0.17854251,-0.014979757,-0.213157895,0.147165992,-0.197802198,0.103785104,0.147130647,0.326007326,0.075091575,0.180708181,0.238095238,0.043345543,-0.23957881,0.215902089,-0.029960921,-0.271710812,-0.366261398,0.022145028,-0.053842814,0.110616587,0.242402866,0.177377334,0.241858446,0.053082935,0.096396005,0.15881459,-0.321320017,0.219062093,-0.063178463,0.077833261,-0.085323491,0.256838906,-0.392748589,0.114634146,0.029268293,-0.003690838,0.309921841,-0.007641634,0.140843215,-0.134123847,0.037417655,-
0.190729483,0.032132002,-0.134715588,0.192574902,-0.008901433,0.217977834,-0.095527573,-0.235670864,0.162159307,0.270608838,0.193265057,-0.09813287,0.274070776,-0.184371184,0.220512821,-0.179487179,0.023199023,0.183150183,0.401098901,-0.301196581,0.486568987,0.209401709,-0.169108669,0.033577534,0.134920635,-0.174603175,0.176434676,0.034188034,-0.036019536,0.136141636,-0.147741148,-0.175213675,0.199023199,0.062564103,0.272478632,-0.044102564,-0.177435897,0.246031746,-0.114774115,0.074782609,0.012820513,0.266788767,0.032967033,-0.298534799,0.235042735,0.26007326,-0.236263736,0.169108669,0.380952381,0.283882784,0.1001221,-0.262515263,-0.033577534,-0.069731098,0.104395604,0.228327228,-0.410866911,0.277777778,-0.498778999,-0.298418144,-0.108976558,-0.284772251,-0.1613143,-0.327692014,0.022908329,-0.083133219,-0.152506194,-0.09891555,0.099599771,-0.036325519,-0.290337336,0.128797408,-0.13657328,-0.110196303,-0.129483514,-0.307261292,-0.164935486,0.06559939,-0.159214789,-0.278139889,-0.297960739,-0.163712598,-0.071393177,-0.159751472,-0.03670669,-0.03956622,-0.122393749,-0.212578616,-0.280198209,0.009186202,-0.368400991,0.114236707,0.171031065,-0.268229464,-0.014903755,-0.169887555,-0.120564132,-0.311987803,-0.372365161,-0.382275586,-0.174080427,-0.355517439,-0.078787879,-0.142331663,0.152890278,-0.113703068,0.096397942,-0.071926815,-0.105698494,-0.205260149,-0.150905279,-0.041966838,-0.347665333,0.012159329,-0.237278445,-0.347100193,-0.206750653,-0.10127692,-0.388755479,0.148923194,-0.24718887,-0.063236135,0.162187917,0.026491328,0.088317134,-0.063617305,-0.228206594,-0.382504288,-0.533762793,0.002477606,-0.305399379,-0.221040595,-0.106384601,-0.341719078,-0.260377358,0.103792643,-0.329597865,-0.245587955,-0.248408614,0.004307223,0.024280541,0.037621498,-0.202595819,-0.018867925,0.026948733,-0.162492853,-0.178502001,0.230798552,0.026033924,0.012235563,-0.454240518,-0.289422527,-0.162035449,-0.243072232,-0.3432503,-0.17941681,-0.254278635,-0.214789403,-0.288888889,-0.32342
2908,-0.303754526,-0.483142427,-0.056222603,-0.148694492,-0.119039451,-0.120259196,-0.257861635,-0.393177054,-0.029616924,-0.126357919,-0.253211359,-0.304516867,-0.392262245,-0.11751477,-0.13222794,-0.173318087,-0.245816657,-0.401105394,-0.263579188,-0.301478997,-0.194434915,-0.017953116,0.072079283,0.192605298,-0.135124833,-0.185591767,-0.158299981,-0.192147894,-0.312826377,-0.231484658,-0.277987421,-0.312413044,0.034419668,-0.226376977,0.035639413,-0.315494568,-0.069792262,-0.070478369,-0.488012197,0.099676005,-0.070402135,-0.161501811,-0.283018868,-0.139165237,-0.347360396,-0.132380408 +2Unstim BasophilsP38Unstim,0.429845626,-0.298646846,0.103529412,-0.408576329,0.304898037,0.453554412,0.544044216,1,0.407486782,0.669677911,0.150295407,0.372441395,-0.270821422,0.139454209,0.113205365,0.165703053,-0.252312673,0.210444063,-0.286296932,0.562095282,-0.092717087,0.53697479,0.377310924,0.222128852,-0.222408964,0.234453782,0.27394958,-0.069746815,0.025560393,0.470173432,0.268076996,0.56432247,0.278755074,0.236668573,0.156333791,0.210977702,0.044939966,0.399839897,0.362454736,0.18848866,0.008271393,0.059043263,0.165853659,0.579965157,0.690766551,0.016550523,0.447735192,0.284391081,0.59679817,0.524985706,0.500971984,0.099983874,0.326560232,0.316706325,0.135483871,0.217741935,0.323790323,0.033465086,-0.051646904,0.261570714,0.522173843,0.312207708,0.262359443,-0.001088933,0.107420814,0.023167421,0.473122172,-0.192217195,0.1963393,0.250009529,-0.119649323,-0.000190585,0.284467315,0.165770917,-0.058890795,0.387688203,0.310691824,0.531694302,-0.183685916,0.174385363,0.569658853,0.190502243,-0.014553015,0.35234861,0.229018492,-0.484845169,0.515701937,-0.052415604,-0.230769231,-0.024619761,0.002735529,0.130865103,-0.193548387,0.232245275,0.041689463,-0.467118941,-0.239522924,0.513951198,0.323294723,-0.177091377,-0.0263705,0.230112704,0.065105591,0.252448433,-0.250246198,0.276069592,-0.386561265,-0.167720685,-0.066198036,-0.328737381,-0.172166739,0.091836735,-0.102692141,0.235019
54,0.007815892,-0.157118548,-0.030872701,-0.123571834,0.049671826,0.223404255,-0.027327935,-0.334817814,0.233805668,0.011336032,-0.342105263,0.008502024,-0.136752137,-0.186202686,-0.153846154,0.141025641,-0.134920635,0.19047619,-0.055555556,0.105006105,-0.202887538,-0.091054832,-0.037451151,-0.349544073,-0.164025185,-0.216999566,-0.182262267,0.169995658,0.055317148,0.224381242,0.227746418,0.094550586,-0.139600521,0.075445072,-0.243269648,0.155123752,-0.106708641,0.129722102,-0.231980026,0.2967868,-0.367564047,0.029643527,0.100375235,-0.167933131,0.28582284,-0.146640316,0.022002635,-0.311462451,-0.131488801,-0.025075988,0.152192792,-0.157837603,0.074576639,-0.17032132,-0.086635012,-0.370929223,-0.094224924,0.097651141,0.058013588,0.206880253,-0.062092922,0.269318104,-0.004884005,0.247863248,-0.083638584,0.30952381,0.324786325,0.383394383,-0.032478632,0.293040293,0.108058608,-0.16971917,0.047619048,0.000610501,-0.263125763,-0.178266178,0.285714286,0.119047619,0.178876679,-0.054334554,-0.322344322,0.035409035,0.158290598,-0.020854701,-0.074871795,-0.269059829,-0.057997558,0.148962149,0.293913043,0.015262515,0.256410256,-0.137973138,-0.401098901,0.108058608,-0.092185592,-0.342490842,0.086691087,0.260683761,0.247863248,0.015262515,-0.453601954,0.084859585,0.066378642,0.152625153,0.075091575,-0.384004884,0.262515263,-0.548229548,-0.299409186,-0.169506385,-0.294453974,-0.179267758,-0.274023251,0.038307604,-0.175223937,0.000114351,-0.014827803,0.01841052,-0.195349724,-0.273947017,0.13733562,-0.307718696,-0.127272727,-0.062626263,-0.250543168,-0.153728869,-0.04898037,-0.131694302,-0.077568134,-0.263198018,-0.179721746,-0.034190966,-0.099792258,-0.051267391,-0.101812499,-0.187497618,-0.088317134,-0.141757195,-0.037392796,-0.4089575,0.066209262,0.112940728,-0.217991233,0.117972175,-0.115456451,-0.226300743,-0.237430913,-0.336611397,-0.249475891,-0.173851725,-0.404078521,0.054164284,-0.030379843,0.076731022,-0.076882028,-0.067810177,-0.066056794,-0.191233086,-0.174537831,-0.251
15304,-0.247265104,-0.164398704,-0.075509815,-0.194129979,-0.232556367,-0.135508586,-0.131999238,-0.136878216,-0.134819897,-0.187802554,-0.232323232,0.018639222,0.040747094,-0.003621117,0.108290452,-0.030760435,-0.154716981,-0.377861213,-0.045549838,-0.485734434,-0.311911569,-0.144806556,-0.263579188,-0.328378121,0.088545836,-0.157766343,0.044863732,-0.272651039,0.056680008,0.104326282,-0.058585859,-0.233547428,-0.12109777,-0.082065942,-0.053859348,-0.369620736,0.09334858,-0.071316943,-0.135201067,-0.173927959,-0.280503145,-0.158376215,-0.210977702,-0.312870457,-0.313131313,-0.197103107,-0.136115876,0.134819897,-0.333638269,-0.235067658,-0.44605386,-0.048294263,-0.269220507,-0.043034115,0.091214027,-0.146636173,-0.30268725,0.062245092,-0.125290642,-0.309243377,-0.370154374,-0.329292929,-0.30268725,-0.11278826,0.02199352,-0.084505432,-0.047455689,-0.33912712,-0.062056873,-0.101505622,-0.107070707,0.037240328,-0.012388031,0.139851344,-0.160129598,-0.199466362,-0.225538403,-0.131618067,-0.345835716,-0.396302649,-0.020812289,-0.094568325,-0.151438917,0.138479131,-0.370688012,-0.159062321,-0.147246045,-0.386544692,0.073832666,-0.103182771,-0.106308367,-0.184753192,-0.2396417,-0.318772632,-0.211206404 +2Unstim 
BasophilsZap70_SykUnstim,0.326592318,-0.497615425,0.071006475,-0.340329114,0.417636748,0.478002333,0.41946832,0.407486782,1,0.324531799,0.053764292,0.217461108,-0.274850388,-0.161644793,0.447631735,0.204904294,-0.348273683,0.028045958,-0.234174209,0.431519161,-0.143697479,0.645098039,0.475630252,0.496638655,-0.265266106,0.281512605,0.580392157,0.359138105,0.259364066,0.339871221,0.307666066,0.525241648,0.580886741,0.290647703,0.116713843,0.333308085,0.112069358,0.418584446,0.177090192,0.075514219,0.24508733,0.220895307,0.105125531,0.495006029,0.393631866,-0.07973837,0.644799007,0.182355963,0.478612858,0.355668536,0.391994729,-0.076009137,0.575676419,0.586526314,-0.386290323,0.218951613,0.24233871,0.146647558,-0.468189817,0.25508573,0.600743038,0.512916052,0.165871808,-0.092587299,0.073406567,-0.074131123,0.563070481,0.088893948,0.129641671,0.229290015,-0.044530113,0.395810506,0.238218932,0.175182303,0.043843273,0.335826498,0.145800823,0.41504202,-0.090243116,0.063837942,0.497615425,0.25868578,0.160857909,0.270759122,0.131750287,-0.110302566,0.221042841,0.094769096,-0.016304645,0.060294359,-0.136674509,0.015763908,-0.204380901,0.09002556,0.001094271,-0.413744051,-0.164031296,0.369097774,0.41559946,-0.101164811,0.296109865,0.103846364,0.033703562,0.277303568,-0.010176725,0.158997648,-0.255486011,-0.090807254,-0.419531146,-0.445253138,-0.217117443,-0.157814729,0.169110484,0.400564847,0.04018682,-0.145149528,-0.061188104,0.085014994,-0.251641141,0.3626589,-0.100304003,-0.385005263,0.086930136,0.304356792,-0.145896731,-0.089159114,-0.065934066,-0.022588523,0.212454212,0.002442002,0.07020757,0.236263736,0.208791209,0.045787546,-0.027370483,-0.305927783,0.292386271,-0.187900538,-0.092429687,0.10079289,-0.145324231,0.303247574,-0.232212737,0.295427436,0.327251053,0.36624313,-0.188660829,-0.105789089,0.056804614,0.261757397,0.032583908,0.491365338,0.050505058,0.525578442,-0.253176968,0.07492961,0.293145669,-0.211578178,0.449223483,0.005601318,-0.117298194,-0.19261945,0.15321
2529,-0.244379313,-0.005539264,-0.311393551,-0.175518653,-0.200282423,-0.214823162,-0.134571542,0.14097971,-0.163136769,-0.195970454,0.031491187,-0.107201058,-0.017033009,0.104411541,0.171311338,0.066860022,0.437185168,0.21309724,0.492749204,0.173704909,0.597466043,0.293695622,0.010990689,0.368493364,0.402381321,-0.099832088,0.344069612,-0.208823083,0.203938332,0.393833007,0.160586172,-0.036635629,0.029003206,0.040348778,0.249957261,-0.078303985,-0.066678066,0.163944438,-0.253701728,-0.262608696,0.315066406,0.428331557,0.23477332,-0.235994507,0.234468023,-0.163639141,-0.223172037,0.339184861,0.597160746,0.540070225,0.325751797,-0.222561444,0.296137998,-0.045600371,0.29094795,0.226225006,-0.274767214,0.57701115,-0.321172344,-0.262334641,-0.073682646,-0.310260794,-0.157060358,-0.486244411,0.214561118,0.023009133,-0.112374621,-0.08787901,0.12229564,0.001488153,-0.482047057,0.081237884,-0.296829261,-0.171366527,-0.236082098,-0.018888094,-0.181710744,-0.215476904,-0.111153572,-0.196703283,-0.293700324,-0.079177364,-0.019956512,-0.393833808,-0.019193356,-0.090435629,-0.280192475,-0.135803489,-0.129545615,-0.018582832,-0.098103616,0.149006076,-0.085664184,-0.149998178,0.069714238,-0.315679197,-0.023009133,-0.274850388,-0.448620854,-0.447934014,-0.095661519,-0.376960569,0.062464263,-0.282563642,0.205826912,0.089785223,0.017514415,-0.12519563,-0.147632396,-0.307895013,-0.116495659,-0.004693405,-0.352921177,0.18121123,-0.274239863,-0.298132307,-0.04456912,-0.074522117,-0.277445116,0.133971916,-0.395505243,0.090624694,-0.060785321,-0.026214385,-0.215705851,0.22784002,-0.271110927,-0.041935385,-0.415164405,0.073377384,-0.199988663,-0.21776637,-0.262029378,-0.290952965,-0.288205606,-0.09940098,-0.358644842,0.035219618,-0.423360413,-0.073301068,-0.147708712,-0.134124547,-0.295155944,-0.099019402,0.103369388,-0.27446881,-0.312321314,-0.066203724,0.065287937,-0.007135502,-0.307895013,-0.339489643,0.012248643,-0.343381735,-0.314540459,-0.117487761,-0.378028987,-0.318502872,-0.113366
722,-0.352234337,-0.4409893,-0.547383584,-0.187774366,-0.305529232,-0.267905674,-0.014461793,-0.261571485,-0.400771015,-0.123669319,-0.120006174,-0.241195238,-0.198229594,-0.255160981,-0.356508007,-0.351929075,-0.302705557,-0.193040138,-0.354142225,-0.012859167,-0.078722472,-0.13526928,-0.185942793,-0.059030064,0.172587575,-0.274392495,-0.270805665,-0.195405919,-0.275918805,-0.409547301,-0.198534856,-0.269737247,-0.328124883,-0.190674356,-0.069027399,-0.257374131,-0.282024048,-0.066890564,-0.332926507,-0.431678806,0.036135404,-0.206929564,-0.230892641,-0.400771015,-0.353531701,-0.330713357,-0.333002823 +2Unstim Basophils4EBP1Unstim,0.168591576,-0.258395273,0.176289593,-0.107985516,0.051648561,0.590165809,0.3260911,0.669677911,0.324531799,1,0.314884696,0.502877835,-0.170802363,-0.011910268,0.289777983,0.531567993,-0.215425532,0.023594435,-0.077339432,0.296831637,-0.099159664,0.562184874,0.41092437,0.469747899,-0.261344538,0.462745098,0.515966387,-0.098451863,-0.141428802,0.427024967,0.146712407,0.559291023,0.229087652,0.448904136,-0.190176089,0.198475319,0.095635601,0.308733276,0.414141414,0.094797027,-0.12544311,0.038078902,0.244076655,0.304878049,0.510278746,-0.061672474,0.444947735,0.417038308,0.388374309,0.252601487,0.365656566,-0.006611837,0.326318336,0.213395706,0.050806452,0.349193548,0.278225806,0.225032938,-0.053359684,0.460328979,0.299064667,0.326318336,0.391881075,-0.226316628,0.292036199,-0.091402715,0.529773756,-0.268506787,0.404934688,0.182313703,-0.09769392,-0.172708214,0.391499905,0.257175529,0.315951973,0.446845817,0.55715647,0.654964742,0.063998475,0.343624929,0.383495331,0.186781924,0.06401138,0.332558856,0.272786957,-0.4312288,0.1732137,-0.058981233,-0.152423679,-0.088302878,-0.266659372,-0.132331378,-0.204545455,0.147627016,0.034686508,-0.407374986,-0.204070467,0.379800853,0.295238095,0.037065637,0.12331765,0.139730824,0.058540322,0.291623352,-0.301455301,0.151329467,-0.297760211,-0.098418972,0.148908723,-0.233850832,-0.044615719,0.133630048,-0.0
85649153,0.128962223,-0.146113765,-0.14139859,-0.095211085,-0.226967021,0.049185641,0.119192358,0.067813765,-0.127530364,0.003643725,0.00242915,-0.422064777,-0.129352227,-0.035409035,-0.25030525,-0.092185592,0.027472527,-0.252136752,0.307081807,-0.128205128,-0.114163614,-0.060030395,-0.136038961,-0.081524099,-0.271168042,-0.062961355,-0.215805471,-0.25,0.140251845,-0.210840865,0.136235345,0.108445506,-0.00293096,-0.197568389,0.071971342,-0.176291793,0.017585758,-0.145028224,0.100412505,-0.323925315,0.233825445,-0.126682588,-0.127016886,-0.058724203,-0.02485888,0.142422927,-0.348221344,-0.143083004,-0.401581028,-0.265744401,0.141337386,0.419344333,-0.134607034,0.034520191,-0.327616153,-0.18139376,-0.204732957,0.060138949,0.224762704,0.044252789,0.049509804,0.068606166,0.216642651,-0.117826618,0.335384615,-0.047619048,0.380952381,0.399267399,0.271062271,-0.275213675,0.333333333,0.095238095,0.169108669,0.201465201,0.024420024,-0.344322344,-0.052503053,0.031135531,0.228937729,0.111721612,0.030525031,-0.473137973,0.227106227,0.126837607,0.04957265,-0.077606838,-0.284786325,-0.079365079,0.158119658,0.104347826,-0.001221001,0.476190476,-0.036019536,-0.370573871,0.107448107,0.03968254,-0.167887668,0.0995116,0.405982906,0.346153846,-0.114774115,-0.492063492,0.121489621,0.081129451,-0.017094017,0.070818071,-0.536630037,0.247252747,-0.423076923,-0.293615399,-0.320983419,-0.251229274,-0.168442318,-0.221879169,-0.074290071,-0.066056794,0.04936154,-0.333606511,-0.029464456,0.050047646,-0.008042691,0.2360587,-0.347360396,-0.06994473,0.022908329,-0.237202211,-0.013150622,-0.046083476,-0.004307223,-0.14053745,-0.280274443,-0.050657519,-0.006975415,-0.017229221,-0.119801791,-0.040061751,-0.080922432,-0.12902611,-0.0810749,0.09731275,-0.237659615,-0.100743282,0.182313703,-0.099142367,0.04219554,0.089918048,-0.146712407,-0.129712217,-0.286220698,-0.14884696,-0.190242043,-0.146864875,-0.007204117,-0.000914826,0.261640207,0.000419287,-0.027101201,0.101810558,-0.221040595,-0.108900324,-0.
24757004,-0.085572708,-0.044177625,-0.036249285,-0.326624738,-0.222417046,-0.15826488,-0.010939585,-0.133523918,0.063846007,-0.223098914,-0.302839718,-0.027024967,0.00293501,-0.050047646,0.013988946,0.103030303,-0.163026491,-0.164554308,-0.058280922,-0.316720349,-0.398894606,-0.184372022,-0.164170002,-0.281570421,-0.043720221,-0.096397942,0.025271584,-0.277987421,0.103640175,0.088088431,-0.090299219,-0.038117746,-0.163407662,-0.097998856,-0.052792072,-0.076272156,-0.060644178,-0.095711835,0.00102916,-0.143053173,-0.146941109,-0.164093768,-0.111111111,-0.164821133,-0.315951973,-0.197103107,-0.102191729,0.118810749,-0.259996188,-0.072231751,-0.243877337,0.044025157,-0.186963979,-0.015361159,0.212121212,-0.070478369,-0.299409186,0.099218601,0.002248904,-0.183076043,-0.228206594,-0.169201448,-0.199923766,-0.132151706,-0.067200305,-0.080083857,0.114999047,-0.165465981,-0.104139666,-0.064684582,-0.14808462,0.07314656,0.027558605,0.20038117,-0.156241662,-0.183990852,-0.074290071,-0.112330856,-0.272422337,-0.282180294,-0.096437897,0.078406709,-0.047684391,0.079473985,-0.221879169,0.066133028,-0.165465981,-0.226529445,0.028549647,-0.156699066,-0.097617686,-0.136420812,-0.183457214,-0.033733562,-0.226758148 diff --git a/example-docs/language-docs/eng_spa_mult.ppt b/example-docs/language-docs/eng_spa_mult.ppt index 43ebc36573..d19bfc3bf1 100644 Binary files a/example-docs/language-docs/eng_spa_mult.ppt and b/example-docs/language-docs/eng_spa_mult.ppt differ diff --git a/requirements/base.txt b/requirements/base.txt index 8fa2c493b0..98e2c29f1d 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -21,7 +21,7 @@ charset-normalizer==3.3.2 # unstructured-client click==8.1.7 # via nltk -dataclasses-json==0.6.5 +dataclasses-json==0.6.6 # via -r ./base.in dataclasses-json-speakeasy==0.5.11 # via unstructured-client @@ -39,7 +39,7 @@ jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 # via -r ./base.in -lxml==5.2.1 +lxml==5.2.2 # via -r ./base.in 
marshmallow==3.21.2 # via @@ -67,7 +67,7 @@ python-magic==0.4.27 # via -r ./base.in rapidfuzz==3.9.0 # via -r ./base.in -regex==2024.4.28 +regex==2024.5.10 # via nltk requests==2.31.0 # via diff --git a/requirements/build.txt b/requirements/build.txt index 51ada53f22..43b2b2232a 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -48,7 +48,7 @@ markdown-it-py==3.0.0 # myst-parser markupsafe==2.1.5 # via jinja2 -mdit-py-plugins==0.4.0 +mdit-py-plugins==0.4.1 # via myst-parser mdurl==0.1.2 # via markdown-it-py diff --git a/requirements/dev.txt b/requirements/dev.txt index 8def7400cb..c9bee6f6ee 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -253,7 +253,7 @@ platformdirs==3.10.0 # -c ./test.txt # jupyter-core # virtualenv -pre-commit==3.7.0 +pre-commit==3.7.1 # via -r ./dev.in prometheus-client==0.20.0 # via jupyter-server diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 651e20874a..6b1f08bec9 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-docx.in # -lxml==5.2.1 +lxml==5.2.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 3240561f00..913651f5e6 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-odt.in # -lxml==5.2.1 +lxml==5.2.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 153b30a366..8149077ded 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -8,7 +8,7 @@ attrdict==2.0.1 # via unstructured-paddleocr babel==2.15.0 # via flask-babel -bce-python-sdk==0.9.7 +bce-python-sdk==0.9.9 # via visualdl blinker==1.8.2 # via flask @@ -77,7 +77,7 @@ lazy-loader==0.4 # via scikit-image lmdb==1.4.1 # via unstructured-paddleocr -lxml==5.2.1 +lxml==5.2.2 # via # -c ./base.txt # premailer @@ -199,7 +199,7 @@ six==1.16.0 # 
imgaug # python-dateutil # visualdl -tifffile==2024.5.3 +tifffile==2024.5.10 # via scikit-image tqdm==4.66.4 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index b2ce938e86..58a58d11dd 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -85,7 +85,7 @@ kiwisolver==1.4.5 # via matplotlib layoutparser[layoutmodels,tesseract]==0.3.4 # via unstructured-inference -lxml==5.2.1 +lxml==5.2.2 # via # -c ./base.txt # pikepdf @@ -198,7 +198,7 @@ pyparsing==3.0.9 # matplotlib pypdf==4.2.0 # via -r ./extra-pdf-image.in -pypdfium2==4.29.0 +pypdfium2==4.30.0 # via pdfplumber pytesseract==0.3.10 # via layoutparser @@ -222,7 +222,7 @@ rapidfuzz==3.9.0 # via # -c ./base.txt # unstructured-inference -regex==2024.4.28 +regex==2024.5.10 # via # -c ./base.txt # transformers diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 2657ec992c..f532344908 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-pptx.in # -lxml==5.2.1 +lxml==5.2.2 # via python-pptx pillow==10.3.0 # via python-pptx diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a425660958..e2865dd155 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -64,7 +64,7 @@ pyyaml==6.0.1 # via # huggingface-hub # transformers -regex==2024.4.28 +regex==2024.5.10 # via # -c ./base.txt # sacremoses diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index d7523906bd..8a855d7438 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -23,7 +23,7 @@ azure-datalake-store==0.0.53 # via adlfs azure-identity==1.16.0 # via adlfs -azure-storage-blob==12.19.1 +azure-storage-blob==12.20.0 # via adlfs certifi==2024.2.2 # via diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt index 434d1a3b98..9c5e3d2fde 100644 --- a/requirements/ingest/delta-table.txt 
+++ b/requirements/ingest/delta-table.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/delta-table.in # -deltalake==0.17.3 +deltalake==0.17.4 # via -r ./ingest/delta-table.in fsspec==2024.3.1 # via -r ./ingest/delta-table.in diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index ed497d1168..bef1c51f31 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -30,7 +30,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -dataclasses-json==0.6.5 +dataclasses-json==0.6.6 # via # -c ./ingest/../base.txt # langchain-community @@ -51,11 +51,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.37 +langchain-community==0.0.38 # via -r ./ingest/embed-aws-bedrock.in langchain-core==0.1.52 # via langchain-community -langsmith==0.1.54 +langsmith==0.1.57 # via # langchain-community # langchain-core diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 8732773aff..d176e4bbd7 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -23,7 +23,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -dataclasses-json==0.6.5 +dataclasses-json==0.6.6 # via # -c ./ingest/../base.txt # langchain-community @@ -62,11 +62,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.37 +langchain-community==0.0.38 # via -r ./ingest/embed-huggingface.in langchain-core==0.1.52 # via langchain-community -langsmith==0.1.54 +langsmith==0.1.57 # via # langchain-community # langchain-core @@ -120,7 +120,7 @@ pyyaml==6.0.1 # langchain-community # langchain-core # transformers -regex==2024.4.28 +regex==2024.5.10 # via # -c ./ingest/../base.txt # transformers diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 65866ca487..4ed74d29c2 100644 
--- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -38,13 +38,13 @@ idna==3.7 # anyio # httpx # requests -openai==1.26.0 +openai==1.28.1 # via -r ./ingest/embed-octoai.in pydantic==2.7.1 # via openai pydantic-core==2.18.2 # via pydantic -regex==2024.4.28 +regex==2024.5.10 # via # -c ./ingest/../base.txt # tiktoken diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 8684d9beb4..c2d9488e80 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -30,7 +30,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -dataclasses-json==0.6.5 +dataclasses-json==0.6.6 # via # -c ./ingest/../base.txt # langchain-community @@ -59,11 +59,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.37 +langchain-community==0.0.38 # via -r ./ingest/embed-openai.in langchain-core==0.1.52 # via langchain-community -langsmith==0.1.54 +langsmith==0.1.57 # via # langchain-community # langchain-core @@ -83,7 +83,7 @@ numpy==1.26.4 # via # -c ./ingest/../base.txt # langchain-community -openai==1.26.0 +openai==1.28.1 # via -r ./ingest/embed-openai.in orjson==3.10.3 # via langsmith @@ -104,7 +104,7 @@ pyyaml==6.0.1 # via # langchain-community # langchain-core -regex==2024.4.28 +regex==2024.5.10 # via # -c ./ingest/../base.txt # tiktoken diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index 8197b28ca1..1d2be4dabf 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -29,7 +29,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -dataclasses-json==0.6.5 +dataclasses-json==0.6.6 # via # -c ./ingest/../base.txt # langchain @@ -55,7 +55,7 @@ google-auth==2.29.0 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-cloud-aiplatform==1.50.0 +google-cloud-aiplatform==1.51.0 # via 
langchain-google-vertexai google-cloud-bigquery==3.22.0 # via google-cloud-aiplatform @@ -98,14 +98,12 @@ idna==3.7 # requests # yarl jsonpatch==1.33 - # via - # langchain - # langchain-core + # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain==0.1.17 +langchain==0.1.20 # via -r ./ingest/embed-vertexai.in -langchain-community==0.0.37 +langchain-community==0.0.38 # via # -r ./ingest/embed-vertexai.in # langchain @@ -119,7 +117,7 @@ langchain-google-vertexai==1.0.3 # via -r ./ingest/embed-vertexai.in langchain-text-splitters==0.0.1 # via langchain -langsmith==0.1.54 +langsmith==0.1.57 # via # langchain # langchain-community diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index 6c600c3c31..c0bc61e783 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.2 # requests google-api-core==2.19.0 # via google-api-python-client -google-api-python-client==2.128.0 +google-api-python-client==2.129.0 # via -r ./ingest/google-drive.in google-auth==2.29.0 # via diff --git a/requirements/ingest/salesforce.txt b/requirements/ingest/salesforce.txt index 2ce352a8c9..66881b4c3a 100644 --- a/requirements/ingest/salesforce.txt +++ b/requirements/ingest/salesforce.txt @@ -25,7 +25,7 @@ idna==3.7 # requests isodate==0.6.1 # via zeep -lxml==5.2.1 +lxml==5.2.2 # via # -c ./ingest/../base.txt # zeep diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index 7c72e31f37..18209177d3 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -81,7 +81,7 @@ urllib3==1.26.18 # requests validators==0.28.1 # via weaviate-client -weaviate-client==4.5.7 +weaviate-client==4.6.0 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/weaviate.in diff --git a/requirements/test.txt b/requirements/test.txt index 0f1fa949cd..f8e829eb59 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -37,7 +37,7 @@ 
flake8==7.0.0 # flake8-print flake8-print==5.0.0 # via -r ./test.in -freezegun==1.5.0 +freezegun==1.5.1 # via -r ./test.in grpcio==1.63.0 # via -r ./test.in @@ -52,7 +52,7 @@ label-studio-sdk==0.0.32 # via -r ./test.in label-studio-tools==0.0.4 # via label-studio-sdk -lxml==5.2.1 +lxml==5.2.2 # via # -c ./base.txt # label-studio-sdk @@ -114,7 +114,7 @@ requests==2.31.0 # via # -c ./base.txt # label-studio-sdk -ruff==0.4.3 +ruff==0.4.4 # via -r ./test.in six==1.16.0 # via diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py index 4572a00ccc..466d8f0a85 100644 --- a/test_unstructured/partition/csv/test_csv.py +++ b/test_unstructured/partition/csv/test_csv.py @@ -15,7 +15,7 @@ from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table -from unstructured.partition.csv import partition_csv +from unstructured.partition.csv import get_delimiter, partition_csv from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA EXPECTED_FILETYPE = "text/csv" @@ -270,3 +270,8 @@ def test_partition_csv_header(): == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX ) assert "" in elements[0].metadata.text_as_html + + +def test_partition_csv_detects_the_right_csv_delimiter(): + # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file -- + assert get_delimiter("example-docs/csv-with-long-lines.csv") == "," diff --git a/test_unstructured/partition/docx/test_doc.py b/test_unstructured/partition/docx/test_doc.py index 2d80e18a47..a87722a968 100644 --- a/test_unstructured/partition/docx/test_doc.py +++ b/test_unstructured/partition/docx/test_doc.py @@ -20,59 +20,15 @@ from unstructured.partition.docx import partition_docx -def test_partition_doc_for_deterministic_and_unique_ids(): - ids = [element.id for element in 
partition_doc("example-docs/duplicate-paragraphs.doc")] - - assert ids == [ - "ade273c622c48d67a7be7b3816d5b4d8", - "7d0b32fdf169f9578723486cb4bc1235", - "1feb6e8e9c1662cfaef75907aeeb0900", - "aa2a8ac10143b12f0fe2087837ea11d2", - "da31ba7ed3919067d2c6572dc1617271", - "1914359c179a160df921b769acf8c353", - "f9d0d379fc791bae487b7a45f65caa50", - ] - - -@pytest.fixture() -def mock_document(): - document = docx.Document() - - document.add_paragraph("These are a few of my favorite things:", style="Heading 1") - # NOTE(robinson) - this should get picked up as a list item due to the • - document.add_paragraph("• Parrots", style="Normal") - # NOTE(robinson) - this should get dropped because it's empty - document.add_paragraph("• ", style="Normal") - document.add_paragraph("Hockey", style="List Bullet") - # NOTE(robinson) - this should get dropped because it's empty - document.add_paragraph("", style="List Bullet") - # NOTE(robinson) - this should get picked up as a title - document.add_paragraph("Analysis", style="Normal") - # NOTE(robinson) - this should get dropped because it is empty - document.add_paragraph("", style="Normal") - # NOTE(robinson) - this should get picked up as a narrative text - document.add_paragraph("This is my first thought. 
This is my second thought.", style="Normal") - document.add_paragraph("This is my third thought.", style="Body Text") - # NOTE(robinson) - this should just be regular text - document.add_paragraph("2023") - # NOTE(robinson) - this should be an address - document.add_paragraph("DOYLESTOWN, PA 18901") +def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename) - return document - -@pytest.fixture() -def expected_elements(): - return [ - Title("These are a few of my favorite things:"), - ListItem("Parrots"), - ListItem("Hockey"), - Title("Analysis"), - NarrativeText("This is my first thought. This is my second thought."), - NarrativeText("This is my third thought."), - Text("2023"), - Address("DOYLESTOWN, PA 18901"), - ] +# -- document-source (file or filename) ---------------------------------------------------------- def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys): @@ -88,36 +44,6 @@ def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, c assert capsys.readouterr().err == "" -def test_partition_doc_from_filename_with_metadata_filename( - mock_document, - expected_elements, - tmpdir, -): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - - elements = partition_doc(filename=doc_filename, metadata_filename="test") - assert elements == expected_elements - assert all(element.metadata.filename == "test" for element in elements) - - -def 
test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename) - - -def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir): - doc_filename = os.path.join(tmpdir.dirname, "asdf.doc") - - with pytest.raises(ValueError): - partition_doc(filename=doc_filename) - - def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") @@ -148,18 +74,6 @@ def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements assert element.metadata.filename is None -def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - - with open(doc_filename, "rb") as f: - elements = partition_doc(file=f, metadata_filename="test") - for element in elements: - assert element.metadata.filename == "test" - - def test_partition_doc_raises_with_both_specified(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") @@ -175,33 +89,76 @@ def test_partition_doc_raises_with_neither(): partition_doc() -def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir): +def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir): + doc_filename = os.path.join(tmpdir.dirname, "asdf.doc") 
+ + with pytest.raises(ValueError): + partition_doc(filename=doc_filename) + + +# -- `include_metadata` arg ---------------------------------------------------------------------- + + +def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") - with open(doc_filename, "rb") as f: - elements = partition_doc(file=f, include_metadata=False) + elements = partition_doc(filename=doc_filename, include_metadata=False) assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None -def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir): +def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") - elements = partition_doc(filename=doc_filename, include_metadata=False) + with open(doc_filename, "rb") as f: + elements = partition_doc(file=f, include_metadata=False) assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None +# -- .metadata.filename -------------------------------------------------------------------------- + + +def test_partition_doc_from_filename_with_metadata_filename( + mock_document, + expected_elements, + tmpdir, +): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + + elements = partition_doc(filename=doc_filename, 
metadata_filename="test") + assert elements == expected_elements + assert all(element.metadata.filename == "test" for element in elements) + + +def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + + with open(doc_filename, "rb") as f: + elements = partition_doc(file=f, metadata_filename="test") + for element in elements: + assert element.metadata.filename == "test" + + +# -- .metadata.last_modified --------------------------------------------------------------------- + + def test_partition_doc_metadata_date( mocker, filename="example-docs/fake.doc", @@ -283,6 +240,19 @@ def test_partition_doc_from_file_explicit_get_metadata_date( assert elements[0].metadata.last_modified == mocked_last_modification_date +def test_partition_doc_from_file_without_metadata_date( + filename="example-docs/fake.doc", +): + """Test partition_doc() with file that are not possible to get last modified date""" + with open(filename, "rb") as f: + sf = SpooledTemporaryFile() + sf.write(f.read()) + sf.seek(0) + elements = partition_doc(file=sf, date_from_file_object=True) + + assert elements[0].metadata.last_modified is None + + def test_partition_doc_from_file_metadata_date_with_custom_metadata( mocker, filename="example-docs/fake.doc", @@ -302,17 +272,23 @@ def test_partition_doc_from_file_metadata_date_with_custom_metadata( assert elements[0].metadata.last_modified == expected_last_modified_date -def test_partition_doc_from_file_without_metadata_date( - filename="example-docs/fake.doc", -): - """Test partition_doc() with file that are not possible to get last modified date""" - with open(filename, "rb") as f: - sf = SpooledTemporaryFile() - sf.write(f.read()) - sf.seek(0) - elements = partition_doc(file=sf, date_from_file_object=True) 
+# -- language-recognition metadata --------------------------------------------------------------- - assert elements[0].metadata.last_modified is None + +def test_partition_doc_element_metadata_has_languages(): + filename = "example-docs/fake-doc-emphasized-text.doc" + elements = partition_doc(filename=filename) + assert elements[0].metadata.languages == ["eng"] + + +def test_partition_doc_respects_detect_language_per_element(): + filename = "example-docs/language-docs/eng_spa_mult.doc" + elements = partition_doc(filename=filename, detect_language_per_element=True) + langs = [element.metadata.languages for element in elements] + assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + + +# -- miscellaneous ------------------------------------------------------------------------------- def test_partition_doc_grabs_emphasized_texts(): @@ -352,14 +328,59 @@ def test_add_chunking_strategy_on_partition_doc(filename="example-docs/fake.doc" assert chunk_elements == chunks -def test_partition_doc_element_metadata_has_languages(): - filename = "example-docs/fake-doc-emphasized-text.doc" - elements = partition_doc(filename=filename) - assert elements[0].metadata.languages == ["eng"] +def test_partition_doc_for_deterministic_and_unique_ids(): + ids = [element.id for element in partition_doc("example-docs/duplicate-paragraphs.doc")] + assert ids == [ + "ade273c622c48d67a7be7b3816d5b4d8", + "7d0b32fdf169f9578723486cb4bc1235", + "1feb6e8e9c1662cfaef75907aeeb0900", + "aa2a8ac10143b12f0fe2087837ea11d2", + "da31ba7ed3919067d2c6572dc1617271", + "1914359c179a160df921b769acf8c353", + "f9d0d379fc791bae487b7a45f65caa50", + ] -def test_partition_doc_respects_detect_language_per_element(): - filename = "example-docs/language-docs/eng_spa_mult.doc" - elements = partition_doc(filename=filename, detect_language_per_element=True) - langs = [element.metadata.languages for element in elements] - assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + +# == 
module-level fixtures ======================================================================= + + +@pytest.fixture() +def expected_elements(): + return [ + Title("These are a few of my favorite things:"), + ListItem("Parrots"), + ListItem("Hockey"), + Title("Analysis"), + NarrativeText("This is my first thought. This is my second thought."), + NarrativeText("This is my third thought."), + Text("2023"), + Address("DOYLESTOWN, PA 18901"), + ] + + +@pytest.fixture() +def mock_document(): + document = docx.Document() + + document.add_paragraph("These are a few of my favorite things:", style="Heading 1") + # NOTE(robinson) - this should get picked up as a list item due to the • + document.add_paragraph("• Parrots", style="Normal") + # NOTE(robinson) - this should get dropped because it's empty + document.add_paragraph("• ", style="Normal") + document.add_paragraph("Hockey", style="List Bullet") + # NOTE(robinson) - this should get dropped because it's empty + document.add_paragraph("", style="List Bullet") + # NOTE(robinson) - this should get picked up as a title + document.add_paragraph("Analysis", style="Normal") + # NOTE(robinson) - this should get dropped because it is empty + document.add_paragraph("", style="Normal") + # NOTE(robinson) - this should get picked up as a narrative text + document.add_paragraph("This is my first thought. 
This is my second thought.", style="Normal") + document.add_paragraph("This is my third thought.", style="Body Text") + # NOTE(robinson) - this should just be regular text + document.add_paragraph("2023") + # NOTE(robinson) - this should be an address + document.add_paragraph("DOYLESTOWN, PA 18901") + + return document diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 52ed3361e8..9e89d99737 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -1,16 +1,29 @@ # pyright: reportPrivateUsage=false +"""Test suite for `unstructured.partition.docx` module.""" + +from __future__ import annotations + +import io import pathlib import re -from tempfile import SpooledTemporaryFile -from typing import Dict, List +import tempfile +from typing import Any import docx import pytest from docx.document import Document from pytest_mock import MockFixture -from test_unstructured.unit_utils import assert_round_trips_through_JSON +from test_unstructured.unit_utils import ( + FixtureRequest, + Mock, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, + instance_mock, + property_mock, +) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( Address, @@ -26,395 +39,80 @@ Text, Title, ) -from unstructured.partition.docx import _DocxPartitioner, partition_docx +from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA +# -- docx-file loading behaviors ----------------------------------------------------------------- -class Describe_DocxPartitioner: - """Unit-test suite for `unstructured.partition.docx._DocxPartitioner`.""" - # -- table behaviors ------------------------------------------------------------------------- +def test_partition_docx_from_filename( + 
mock_document_file_path: str, expected_elements: list[Element] +): + elements = partition_docx(mock_document_file_path) - def it_can_convert_a_table_to_html(self): - table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert _DocxPartitioner()._convert_table_to_html(table) == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
Header Col 1 Header Col 2
Lorem ipsum A link example
" - ) + assert elements == expected_elements + assert elements[0].metadata.page_number is None + for element in elements: + assert element.metadata.filename == "mock_document.docx" + if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: + assert {element.metadata.detection_origin for element in elements} == {"docx"} - def and_it_can_convert_a_nested_table_to_html(self): - """ - Fixture table is: - +---+-------------+---+ - | a | >b< | c | - +---+-------------+---+ - | | +-----+---+ | | - | | | e | f | | | - | d | +-----+---+ | i | - | | | g&t | h | | | - | | +-----+---+ | | - +---+-------------+---+ - | j | k | l | - +---+-------------+---+ - """ - table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] +def test_partition_docx_from_filename_with_metadata_filename(mock_document_file_path: str): + elements = partition_docx(mock_document_file_path, metadata_filename="test") + assert all(element.metadata.filename == "test" for element in elements) - # -- re.sub() strips out the extra padding inserted by tabulate -- - html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) - expected_lines = [ - "", - "", - "", - "", - "", - "", - "", - "", - "
a>b<c
d", - "", - "", - "", - "", - "
ef
g&th
i
jkl
", - ] - actual_lines = html.splitlines() - for expected, actual in zip(expected_lines, actual_lines): - assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" +def test_partition_docx_with_spooled_file( + mock_document_file_path: str, expected_elements: list[Text] +): + """`partition_docx()` accepts a SpooledTemporaryFile as its `file` argument. - def it_can_convert_a_table_to_plain_text(self): - table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( - "Header Col 1 Header Col 2 Lorem ipsum A link example" - ) + `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need + to ensure the source file is appropriately converted in this case. + """ + with open(mock_document_file_path, "rb") as test_file: + spooled_temp_file = tempfile.SpooledTemporaryFile() + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = partition_docx(file=spooled_temp_file) + assert elements == expected_elements + for element in elements: + assert element.metadata.filename is None - def and_it_can_convert_a_nested_table_to_plain_text(self): - """ - Fixture table is: - +---+-------------+---+ - | a | >b< | c | - +---+-------------+---+ - | | +-----+---+ | | - | | | e | f | | | - | d | +-----+---+ | i | - | | | g&t | h | | | - | | +-----+---+ | | - +---+-------------+---+ - | j | k | l | - +---+-------------+---+ - """ - table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( - "a >b< c d e f g&t h i j k l" - ) +def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]): + with open(mock_document_file_path, "rb") as f: + elements = partition_docx(file=f) + assert elements == expected_elements + for element in elements: + assert element.metadata.filename is None - def 
but_the_text_of_a_merged_cell_appears_only_once(self): - """ - Fixture table is: - +---+-------+ - | a | b | - | +---+---+ - | | c | d | - +---+---+ | - | e | | - +-------+---+ - """ - table = docx.Document(example_doc_path("docx-tables.docx")).tables[2] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e" +def test_partition_docx_from_file_with_metadata_filename( + mock_document_file_path: str, expected_elements: list[Text] +): + with open(mock_document_file_path, "rb") as f: + elements = partition_docx(file=f, metadata_filename="test") + assert elements == expected_elements + for element in elements: + assert element.metadata.filename == "test" - def it_can_partition_tables_with_incomplete_rows(self): - """DOCX permits table rows to start late and end early. - It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that - they allow rows to start late, like in column 3, and end early, like the last cell is in - column 5 of a 7 column table. 
+def test_partition_docx_uses_file_path_when_both_are_specified( + mock_document_file_path: str, expected_elements: list[Text] +): + f = io.BytesIO(b"abcde") + elements = partition_docx(filename=mock_document_file_path, file=f) + assert elements == expected_elements - A practical example might look like this: - +------+------+ - | East | West | - +----------+------+------+ - | Started | 25 | 32 | - +----------+------+------+ - | Finished | 17 | 21 | - +----------+------+------+ - """ - elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx"))) +def test_partition_docx_raises_with_neither(): + with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"): + partition_docx() - e = next(elements) - assert e.text.startswith("Example of DOCX table ") - # -- - # ┌───┬───┐ - # │ a │ b │ - # ├───┼───┤ - # │ c │ d │ - # └───┴───┘ - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "a b c d" - assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a b
c d
" - ) - # -- - # ┌───┐ - # │ a │ - # ├───┼───┐ - # │ b │ c │ - # └───┴───┘ - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "a b c", f"actual {e.text=}" - assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a
b c
" - ), f"actual {e.metadata.text_as_html=}" - # -- - # ┌───────┐ - # │ a │ - # ├───┬───┼───┐ - # │ b │ c │ d │ - # └───┴───┴───┘ - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "a b c d", f"actual {e.text=}" - assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a a
b c d
" - ), f"actual {e.metadata.text_as_html=}" - # -- - # ┌───┬───┐ - # │ │ b │ - # │ a ├───┼───┐ - # │ │ c │ d │ - # └───┴───┴───┘ - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "a b c d", f"actual {e.text=}" - assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a b
a c d
" - ), f"actual {e.metadata.text_as_html=}" - # -- late-start, early-end, and >2 rows vertical span -- - # ┌───────┬───┬───┐ - # │ a │ b │ c │ - # └───┬───┴───┼───┘ - # │ d │ - # ┌───┤ ├───┐ - # │ e │ │ f │ - # └───┤ ├───┘ - # │ │ - # └───────┘ - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "a b c d e f", f"actual {e.text=}" - assert e.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n\n" - "\n" - "\n" - "\n" - "\n" - "
a a b c
d d
e d d f
d d
" - ), f"actual {e.metadata.text_as_html=}" - # -- - # -- The table from the specimen file we received with the bug report. -- - e = next(elements) - assert type(e).__name__ == "Table" - assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}" - assert e.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
Data Data
Data Data
Data Data
More
Dato
WTF? WTF?
StrangeStrange
Format Format
" - ), f"actual {e.metadata.text_as_html=}" - # -- page-break behaviors -------------------------------------------------------------------- - - def it_places_page_breaks_precisely_where_they_occur(self): - """Page-break behavior has some subtleties. - - * A hard page-break does not generate a PageBreak element (because that would double-count - it). Word inserts a rendered page-break for the hard break at the effective location. - * A (rendered) page-break mid-paragraph produces two elements, like `Text, PageBreak, Text`, - so each Text (subclass) element gets the right page-number. - * A rendered page-break mid-hyperlink produces two text elements, but the hyperlink itself - is not split; the entire hyperlink goes on the page where the hyperlink starts, even - though some of its text appears on the following page. The rest of the paragraph, after - the hyperlink, appears on the following page. - * Odd and even-page section starts can lead to two page-breaks, like an odd-page section - start could go from page 3 to page 5 because 5 is the next odd page. 
- """ - - def str_repr(e: Element) -> str: - """A more detailed `repr()` to aid debugging when assertion fails.""" - return f"{e.__class__.__name__}('{e}')" - - expected = [ - # NOTE(scanny) - -- page 1 -- - NarrativeText( - "First page, tab here:\t" - "followed by line-break here:\n" - "here:\n" - "and here:\n" - "no-break hyphen here:-" - "and hard page-break here>>" - ), - PageBreak(""), - # NOTE(scanny) - -- page 2 -- - NarrativeText( - "<> <>"), - NarrativeText("<>"), - PageBreak(""), - # NOTE(scanny) - -- page 4 -- - PageBreak(""), - # NOTE(scanny) - -- page 5 -- - NarrativeText("<> ' - ), - PageBreak(""), - # NOTE(scanny) - -- page 6 -- - Title("< str: - """String path to a file in the example-docs/ directory.""" - return str(pathlib.Path(__file__).parent.parent.parent.parent / "example-docs" / filename) - - @pytest.fixture() -def expected_elements() -> List[Text]: +def expected_elements() -> list[Text]: return [ Title("These are a few of my favorite things:"), ListItem("Parrots"), @@ -937,12 +633,12 @@ def expected_elements() -> List[Text]: @pytest.fixture() -def expected_emphasized_text_contents() -> List[str]: +def expected_emphasized_text_contents() -> list[str]: return ["bold", "italic", "bold-italic", "bold-italic"] @pytest.fixture() -def expected_emphasized_text_tags() -> List[str]: +def expected_emphasized_text_tags() -> list[str]: return ["b", "i", "b", "i"] @@ -990,16 +686,620 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> return filename -def test_ids_are_unique_and_deterministic(): - elements = partition_docx("example-docs/duplicate-paragraphs.docx") +@pytest.fixture() +def opts_args() -> dict[str, Any]: + """All default arguments for `DocxPartitionerOptions`. + + Individual argument values can be changed to suit each test. Makes construction of opts more + compact for testing purposes. 
+ """ + return { + "date_from_file_object": False, + "file": None, + "file_path": None, + "include_page_breaks": True, + "infer_table_structure": True, + "metadata_file_path": None, + "metadata_last_modified": None, + } + + +# ================================================================================================ +# ISOLATED UNIT TESTS +# ================================================================================================ +# These test components used by `partition_docx()` in isolation such that all edge cases can be +# exercised. +# ================================================================================================ + + +class DescribeDocxPartitionerOptions: + """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.""" + + # -- .document ------------------------------- + + def it_loads_the_docx_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + ): + document_ = instance_mock(request, Document) + docx_Document_ = function_mock( + request, "unstructured.partition.docx.docx.Document", return_value=document_ + ) + _docx_file_prop_ = property_mock( + request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx" + ) + opts = DocxPartitionerOptions(**opts_args) + + document = opts.document + + _docx_file_prop_.assert_called_once_with() + docx_Document_.assert_called_once_with("abcde.docx") + assert document is document_ + + # -- .include_page_breaks -------------------- + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.include_page_breaks is arg_value + + # -- .infer_table_structure ------------------ + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_text_as_html_in_Table_metadata( 
+ self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["infer_table_structure"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.infer_table_structure is arg_value + + # -- .increment_page_number() ---------------- + + def it_generates_a_PageBreak_element_when_the_page_number_is_incremented( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + assert isinstance(next(page_break_iter, None), PageBreak) + assert opts.page_number == 2 + with pytest.raises(StopIteration): + next(page_break_iter) + + def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off( + self, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = False + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + with pytest.raises(StopIteration): + next(page_break_iter) + assert opts.page_number == 2 + + # -- .last_modified -------------------------- + + def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.last_modified == "2024-03-05T17:02:53" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_: Mock + ): + opts_args["file_path"] = "a/b/document.docx" + get_last_modified_date_.return_value = "2024-04-02T20:32:35" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_.assert_called_once_with("a/b/document.docx") + assert last_modified == "2024-04-02T20:32:35" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = 
io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = True + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_called_once_with(file) + assert last_modified == "2024-04-02T20:42:07" + + def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = False + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_not_called() + assert last_modified is None + + # -- .metadata_file_path --------------------- + + def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "x/y/z.docx" + opts_args["metadata_file_path"] = "a/b/c.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == "a/b/c.docx" + + @pytest.mark.parametrize("file_path", ["u/v/w.docx", None]) + def and_it_falls_back_to_the_document_file_path_otherwise( + self, file_path: str | None, opts_args: dict[str, Any] + ): + opts_args["file_path"] = file_path + opts_args["metadata_file_path"] = None + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == file_path + + # -- ._metadata_page_number ------------------ + + @pytest.mark.parametrize( + ("page_count", "document_contains_pagebreaks", "expected_value"), + [(7, True, 7), (1, False, None)], + ) + def it_reports_None_when_no_rendered_page_breaks_are_found_in_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + page_count: int, + document_contains_pagebreaks: bool, + 
expected_value: int | None, + ): + _document_contains_pagebreaks_prop_ = property_mock( + request, + DocxPartitionerOptions, + "_document_contains_pagebreaks", + return_value=document_contains_pagebreaks, + ) + opts = DocxPartitionerOptions(**opts_args) + opts._page_counter = page_count + + metadata_page_number = opts.metadata_page_number + + _document_contains_pagebreaks_prop_.assert_called_once_with() + assert metadata_page_number is expected_value + + # -- .page_number ---------------------------- + + def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]): + """In DOCX, page-number is the slide number.""" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.page_number == 1 + list(opts.increment_page_number()) + assert opts.page_number == 2 + list(opts.increment_page_number()) + assert opts.page_number == 3 + + def it_assigns_the_correct_page_number_when_starting_page_number_is_given( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args, starting_page_number=3) + + assert opts.page_number == 3 + list(opts.increment_page_number()) + assert opts.page_number == 4 + + # -- ._document_contains_pagebreaks ---------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)] + ) + def it_knows_whether_the_document_contains_page_breaks( + self, opts_args: dict[str, Any], file_name: str, expected_value: bool + ): + opts_args["file_path"] = example_doc_path(file_name) + opts = DocxPartitionerOptions(**opts_args) + + assert opts._document_contains_pagebreaks is expected_value + + # -- ._docx_file ----------------------------- + + def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "l/m/n.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts._docx_file == "l/m/n.docx" + + def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( + self, 
opts_args: dict[str, Any] + ): + spooled_temp_file = tempfile.SpooledTemporaryFile() + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is not spooled_temp_file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( + self, opts_args: dict[str, Any] + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "): + opts._docx_file + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def get_last_modified_date_(self, request: FixtureRequest) -> Mock: + return function_mock(request, "unstructured.partition.docx.get_last_modified_date") + + @pytest.fixture() + def get_last_modified_date_from_file_(self, request: FixtureRequest): + return function_mock( + request, "unstructured.partition.docx.get_last_modified_date_from_file" + ) + + +class Describe_DocxPartitioner: + """Unit-test suite for `unstructured.partition.docx._DocxPartitioner`.""" + + # -- table behaviors ------------------------------------------------------------------------- + + def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] + + assert _DocxPartitioner(opts)._convert_table_to_html(table) == ( + "\n" + "\n" + "\n" + "\n" + "\n" 
+ "\n" + "\n" + "
Header Col 1 Header Col 2
Lorem ipsum A link example
" + ) + + def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]): + """ + Fixture table is: + + +---+-------------+---+ + | a | >b< | c | + +---+-------------+---+ + | | +-----+---+ | | + | | | e | f | | | + | d | +-----+---+ | i | + | | | g&t | h | | | + | | +-----+---+ | | + +---+-------------+---+ + | j | k | l | + +---+-------------+---+ + """ + opts = DocxPartitionerOptions(**opts_args) + table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] + + # -- re.sub() strips out the extra padding inserted by tabulate -- + html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table)) - ids = [e.id for e in elements] - assert ids == [ - "2f22d82eea1faf5f40dac60cef52700e", - "ca9e1f448e531a5152d960e14eefc360", - "9ddeacb172ac17fb45e6f3f15f3c703d", - "a4fd85d3f4141acae38c8f9c936ed2f3", - "44ebaaf66640719c918246d4ccba1c45", - "f36e8ebcb3b6a051940a168fe73cbc44", - "532b395177652c7d61e1e4d855f1dc1d", - ], "IDs are not deterministic" + expected_lines = [ + "", + "", + "", + "", + "", + "", + "", + "", + "
a>b<c
d", + "", + "", + "", + "", + "
ef
g&th
i
jkl
", + ] + actual_lines = html.splitlines() + for expected, actual in zip(expected_lines, actual_lines): + assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" + + def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( + "Header Col 1 Header Col 2 Lorem ipsum A link example" + ) + + def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]): + """ + Fixture table is: + + +---+-------------+---+ + | a | >b< | c | + +---+-------------+---+ + | | +-----+---+ | | + | | | e | f | | | + | d | +-----+---+ | i | + | | | g&t | h | | | + | | +-----+---+ | | + +---+-------------+---+ + | j | k | l | + +---+-------------+---+ + """ + opts = DocxPartitionerOptions(**opts_args) + table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( + "a >b< c d e f g&t h i j k l" + ) + + def but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]): + """ + Fixture table is: + + +---+-------+ + | a | b | + | +---+---+ + | | c | d | + +---+---+ | + | e | | + +-------+---+ + """ + opts = DocxPartitionerOptions(**opts_args) + table = docx.Document(example_doc_path("docx-tables.docx")).tables[2] + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e" + + def it_can_partition_tables_with_incomplete_rows(self): + """DOCX permits table rows to start late and end early. + + It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that + they allow rows to start late, like in column 3, and end early, like the last cell is in + column 5 of a 7 column table. 
+ + A practical example might look like this: + + +------+------+ + | East | West | + +----------+------+------+ + | Started | 25 | 32 | + +----------+------+------+ + | Finished | 17 | 21 | + +----------+------+------+ + """ + elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx"))) + + e = next(elements) + assert e.text.startswith("Example of DOCX table ") + # -- + # ┌───┬───┐ + # │ a │ b │ + # ├───┼───┤ + # │ c │ d │ + # └───┴───┘ + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "a b c d" + assert e.metadata.text_as_html == ( + "\n" + "\n\n\n" + "\n\n\n" + "
a b
c d
" + ) + # -- + # ┌───┐ + # │ a │ + # ├───┼───┐ + # │ b │ c │ + # └───┴───┘ + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "a b c", f"actual {e.text=}" + assert e.metadata.text_as_html == ( + "\n" + "\n\n\n" + "\n\n\n" + "
a
b c
" + ), f"actual {e.metadata.text_as_html=}" + # -- + # ┌───────┐ + # │ a │ + # ├───┬───┼───┐ + # │ b │ c │ d │ + # └───┴───┴───┘ + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "a b c d", f"actual {e.text=}" + assert e.metadata.text_as_html == ( + "\n" + "\n\n\n" + "\n\n\n" + "
a a
b c d
" + ), f"actual {e.metadata.text_as_html=}" + # -- + # ┌───┬───┐ + # │ │ b │ + # │ a ├───┼───┐ + # │ │ c │ d │ + # └───┴───┴───┘ + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "a b c d", f"actual {e.text=}" + assert e.metadata.text_as_html == ( + "\n" + "\n\n\n" + "\n\n\n" + "
a b
a c d
" + ), f"actual {e.metadata.text_as_html=}" + # -- late-start, early-end, and >2 rows vertical span -- + # ┌───────┬───┬───┐ + # │ a │ b │ c │ + # └───┬───┴───┼───┘ + # │ d │ + # ┌───┤ ├───┐ + # │ e │ │ f │ + # └───┤ ├───┘ + # │ │ + # └───────┘ + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "a b c d e f", f"actual {e.text=}" + assert e.metadata.text_as_html == ( + "\n" + "\n" + "\n" + "\n\n" + "\n" + "\n" + "\n" + "\n" + "
a a b c
d d
e d d f
d d
" + ), f"actual {e.metadata.text_as_html=}" + # -- + # -- The table from the specimen file we received with the bug report. -- + e = next(elements) + assert type(e).__name__ == "Table" + assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}" + assert e.metadata.text_as_html == ( + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
Data Data
Data Data
Data Data
More
Dato
WTF? WTF?
StrangeStrange
Format Format
" + ), f"actual {e.metadata.text_as_html=}" + + # -- page-break behaviors -------------------------------------------------------------------- + + def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]): + """Page-break behavior has some subtleties. + + * A hard page-break does not generate a PageBreak element (because that would double-count + it). Word inserts a rendered page-break for the hard break at the effective location. + * A (rendered) page-break mid-paragraph produces two elements, like `Text, PageBreak, Text`, + so each Text (subclass) element gets the right page-number. + * A rendered page-break mid-hyperlink produces two text elements, but the hyperlink itself + is not split; the entire hyperlink goes on the page where the hyperlink starts, even + though some of its text appears on the following page. The rest of the paragraph, after + the hyperlink, appears on the following page. + * Odd and even-page section starts can lead to two page-breaks, like an odd-page section + start could go from page 3 to page 5 because 5 is the next odd page. 
+ """ + + def str_repr(e: Element) -> str: + """A more detailed `repr()` to aid debugging when assertion fails.""" + return f"{e.__class__.__name__}('{e}')" + + opts_args["file_path"] = example_doc_path("page-breaks.docx") + opts = DocxPartitionerOptions(**opts_args) + expected = [ + # NOTE(scanny) - -- page 1 -- + NarrativeText( + "First page, tab here:\t" + "followed by line-break here:\n" + "here:\n" + "and here:\n" + "no-break hyphen here:-" + "and hard page-break here>>" + ), + PageBreak(""), + # NOTE(scanny) - -- page 2 -- + NarrativeText( + "<> <>"), + NarrativeText("<>"), + PageBreak(""), + # NOTE(scanny) - -- page 4 -- + PageBreak(""), + # NOTE(scanny) - -- page 5 -- + NarrativeText("<> ' + ), + PageBreak(""), + # NOTE(scanny) - -- page 6 -- + Title("< Callable[_P, lis def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: """The decorated function is replaced with this one.""" - def get_call_args_applying_defaults() -> dict[str, Any]: - """Map both explicit and default arguments of decorated func call by param name.""" - sig = inspect.signature(func) - call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) - for param in sig.parameters.values(): - if param.name not in call_args and param.default is not param.empty: - call_args[param.name] = param.default - return call_args - # -- call the partitioning function to get the elements -- elements = func(*args, **kwargs) # -- look for a chunking-strategy argument -- - call_args = get_call_args_applying_defaults() + call_args = get_call_args_applying_defaults(func, *args, **kwargs) chunking_strategy = call_args.pop("chunking_strategy", None) # -- no chunking-strategy means no chunking -- diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 976ab1271b..b2cc258219 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -6,7 +6,6 @@ import enum import functools import hashlib -import inspect import os import 
pathlib import re @@ -23,7 +22,7 @@ RelativeCoordinateSystem, ) from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA -from unstructured.utils import lazyproperty +from unstructured.utils import get_call_args_applying_defaults, lazyproperty Point: TypeAlias = "tuple[float, float]" Points: TypeAlias = "tuple[Point, ...]" @@ -568,20 +567,16 @@ def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: @functools.wraps(func) def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: elements = func(*args, **kwargs) - sig = inspect.signature(func) - params: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) - for param in sig.parameters.values(): - if param.name not in params and param.default is not param.empty: - params[param.name] = param.default + call_args = get_call_args_applying_defaults(func, *args, **kwargs) - regex_metadata: dict["str", "str"] = params.get("regex_metadata", {}) + regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {}) # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was # -- requested, otherwise it will serialize (because it's not None) when it has no # -- meaning or is even misleading. Also it complicates tests that don't use regex-meta. 
if regex_metadata: elements = _add_regex_metadata(elements, regex_metadata) - unique_element_ids: bool = params.get("unique_element_ids", False) + unique_element_ids: bool = call_args.get("unique_element_ids", False) if unique_element_ids is False: elements = assign_and_map_hash_ids(elements) diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 244323d35e..be6efd451b 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -2,12 +2,11 @@ import enum import functools -import inspect import json import os import re import zipfile -from typing import IO, Any, Callable, Dict, List, Optional +from typing import IO, Callable, List, Optional from typing_extensions import ParamSpec @@ -20,6 +19,7 @@ remove_element_metadata, set_element_hierarchy, ) +from unstructured.utils import get_call_args_applying_defaults try: import magic @@ -580,18 +580,14 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element @functools.wraps(func) def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: elements = func(*args, **kwargs) - sig = inspect.signature(func) - params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) - for param in sig.parameters.values(): - if param.name not in params and param.default is not param.empty: - params[param.name] = param.default - include_metadata = params.get("include_metadata", True) + call_args = get_call_args_applying_defaults(func, *args, **kwargs) + include_metadata = call_args.get("include_metadata", True) if include_metadata: - if params.get("metadata_filename"): - params["filename"] = params.get("metadata_filename") + if call_args.get("metadata_filename"): + call_args["filename"] = call_args.get("metadata_filename") metadata_kwargs = { - kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html") + kwarg: call_args.get(kwarg) for kwarg in ("filename", "url", "text_as_html") } # NOTE (yao): do not use 
cast here as cast(None) still is None if not str(kwargs.get("model_name", "")).startswith("chipper"): @@ -620,16 +616,9 @@ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: @functools.wraps(func) def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: elements = func(*args, **kwargs) - sig = inspect.signature(func) - params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) - for param in sig.parameters.values(): - if param.name not in params and param.default is not param.empty: - params[param.name] = param.default + params = get_call_args_applying_defaults(func, *args, **kwargs) include_metadata = params.get("include_metadata", True) if include_metadata: - if params.get("metadata_filename"): - params["filename"] = params.get("metadata_filename") - for element in elements: # NOTE(robinson) - Attached files have already run through this logic # in their own partitioning function diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 69dd809779..4d5b4da8c0 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -3,7 +3,7 @@ from __future__ import annotations import io -from typing import IO, Any, Callable, Optional +from typing import IO, Any, Callable, Literal, Optional import requests @@ -139,7 +139,7 @@ def partition( include_page_breaks: bool = False, strategy: str = PartitionStrategy.AUTO, encoding: Optional[str] = None, - paragraph_grouper: Optional[Callable[[str], str]] = None, + paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None, headers: dict[str, str] = {}, skip_infer_table_types: list[str] = [], ssl_verify: bool = True, diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index bb47428c06..78ed29ea5b 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -111,19 +111,23 @@ def partition_csv( return list(elements) -def get_delimiter(file_path=None, file=None): - """ - Use 
the standard csv sniffer to determine the delimiter. - Read just a small portion in case the file is large. +def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None): + """Use the standard csv sniffer to determine the delimiter. + + Reads just a small portion in case the file is large. """ sniffer = csv.Sniffer() + num_bytes = 65536 - num_bytes = 8192 + # -- read whole lines, sniffer can be confused by a trailing partial line -- if file: - data = file.read(num_bytes).decode("utf-8") + lines = file.readlines(num_bytes) file.seek(0) - else: + data = "\n".join(ln.decode("utf-8") for ln in lines) + elif file_path is not None: with open(file_path) as f: - data = f.read(num_bytes) + data = "\n".join(f.readlines(num_bytes)) + else: + raise ValueError("either `file_path` or `file` argument must be provided") - return sniffer.sniff(data, delimiters=[",", ";"]).delimiter + return sniffer.sniff(data, delimiters=",;").delimiter diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 377045b886..430f87a301 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -7,19 +7,7 @@ import itertools import os import tempfile -from tempfile import SpooledTemporaryFile -from typing import ( - IO, - Any, - Dict, - Iterator, - List, - Optional, - Tuple, - Type, - Union, - cast, -) +from typing import IO, Any, Iterator, Optional, Type, cast # -- CT_* stands for "complex-type", an XML element type in docx parlance -- import docx @@ -80,8 +68,8 @@ import pypandoc DETECTION_ORIGIN: str = "docx" -BlockElement: TypeAlias = Union[CT_P, CT_Tbl] -BlockItem: TypeAlias = Union[Paragraph, DocxTable] +BlockElement: TypeAlias = "CT_P | CT_Tbl" +BlockItem: TypeAlias = "Paragraph | DocxTable" @requires_dependencies("pypandoc") @@ -93,10 +81,10 @@ def convert_and_partition_docx( infer_table_structure: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - languages: 
Optional[List[str]] = ["auto"], + languages: Optional[list[str]] = ["auto"], detect_language_per_element: bool = False, starting_page_number: int = 1, -) -> List[Element]: +) -> list[Element]: """Converts a document to DOCX and then partitions it using partition_docx. Works with any file format support by pandoc. @@ -183,18 +171,16 @@ def extract_docx_filename(file_path: str) -> str: def partition_docx( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, include_page_breaks: bool = True, - include_metadata: bool = True, # used by decorator infer_table_structure: bool = True, + metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - chunking_strategy: Optional[str] = None, # used by decorator - languages: Optional[List[str]] = ["auto"], + languages: Optional[list[str]] = ["auto"], detect_language_per_element: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, - **kwargs: Any, # used by decorator -) -> List[Element]: + **kwargs: Any, +) -> list[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. Parameters @@ -203,6 +189,9 @@ def partition_docx( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + include_page_breaks + When True, add a `PageBreak` element to the element-stream when a page-break is detected in + the document. Note that not all DOCX files include page-break information. infer_table_structure If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string. @@ -225,23 +214,22 @@ def partition_docx( Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. 
starting_page_number - Indicates what page number should be assigned to the first page in the document. - This information will be reflected in elements' metadata and can be be especially - useful when partitioning a document that is part of a larger document. + Assign this number to the first page of this document and increment the page number from + there. """ - # -- verify that only one file-specifier argument was provided -- - exactly_one(filename=filename, file=file) - - elements = _DocxPartitioner.iter_document_elements( - filename, - file, - metadata_filename, - include_page_breaks, - infer_table_structure, - metadata_last_modified, - date_from_file_object, + opts = DocxPartitionerOptions( + date_from_file_object=date_from_file_object, + file=file, + file_path=filename, + include_page_breaks=include_page_breaks, + infer_table_structure=infer_table_structure, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, ) + + elements = _DocxPartitioner.iter_document_elements(opts) + elements = apply_lang_metadata( elements=elements, languages=languages, @@ -250,56 +238,169 @@ def partition_docx( return list(elements) -class _DocxPartitioner: - """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" +class DocxPartitionerOptions: + """Encapsulates partitioning option validation, computation, and application of defaults.""" def __init__( self, - # -- NOTE(scanny): default values here are unnecessary for production use because - # -- `.iter_document_elements()` is the only interface method and always calls with all - # -- args. However, providing defaults eases unit-testing and decouples unit-tests from - # -- future changes to args. 
- filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, + *, + date_from_file_object: bool, + file: IO[bytes] | None, + file_path: str | None, + include_page_breaks: bool, + infer_table_structure: bool, + metadata_file_path: Optional[str], + metadata_last_modified: Optional[str], starting_page_number: int = 1, - ) -> None: - self._filename = filename + ): + self._date_from_file_object = date_from_file_object self._file = file - self._metadata_filename = metadata_filename + self._file_path = file_path self._include_page_breaks = include_page_breaks self._infer_table_structure = infer_table_structure + self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified + # -- options object maintains page-number state -- self._page_counter = starting_page_number - self._date_from_file_object = date_from_file_object + + @lazyproperty + def document(self) -> Document: + """The python-docx `Document` object loaded from file or filename.""" + return docx.Document(self._docx_file) + + @lazyproperty + def include_page_breaks(self) -> bool: + """When True, include `PageBreak` elements in element-stream. + + Note that regardless of this setting, page-breaks are detected, and page-number is tracked + and included in element metadata. Only the presence of distinct `PageBreak` elements (which + contain no text) in the element stream is affected. 
+ """ + return self._include_page_breaks + + def increment_page_number(self) -> Iterator[PageBreak]: + """Increment page-number by 1 and generate a PageBreak element if enabled.""" + self._page_counter += 1 + # -- only emit page-breaks when enabled -- + if self._include_page_breaks: + yield PageBreak("", detection_origin=DETECTION_ORIGIN) + + @lazyproperty + def infer_table_structure(self) -> bool: + """True when partitioner should compute and apply `text_as_html` metadata for tables.""" + return self._infer_table_structure + + @lazyproperty + def last_modified(self) -> Optional[str]: + """The best last-modified date available, None if no sources are available.""" + # -- Value explicitly specified by caller takes precedence. This is used for example when + # -- this file was converted from another format, and any last-modified date for the file + # -- would be just now. + if self._metadata_last_modified: + return self._metadata_last_modified + + if self._file_path: + return ( + None + if is_temp_file_path(self._file_path) + else get_last_modified_date(self._file_path) + ) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @lazyproperty + def metadata_file_path(self) -> str | None: + """The best available file-path for this document or `None` if unavailable.""" + return self._metadata_file_path or self._file_path + + @property + def metadata_page_number(self) -> Optional[int]: + """The current page number to report in metadata, or None if we can't really tell. + + Page numbers are not added to element metadata if we can't find any page-breaks in the + document (which may be a common case). + + In the DOCX format, determining page numbers is strictly a best-efforts attempt since + actual page-breaks are determined at rendering time (e.g. printing) based on the + font-metrics of the target device. 
Explicit (hard) page-breaks are always recorded in the + docx file but the rendered page-breaks are only added optionally. + """ + return self._page_counter if self._document_contains_pagebreaks else None + + @property + def page_number(self) -> int: + """The current page number. + + Note this value may not represent the actual rendered page number when rendered page-break + indicators are not present in the document (not uncommon). Use `.metadata_page_number` for + metadata purposes, which is `None` when rendered page-breaks are not present in this + document. + """ + return self._page_counter + + @lazyproperty + def _document_contains_pagebreaks(self) -> bool: + """True when there is at least one page-break detected in the document. + + Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably + inserted by Microsoft Word, but probably don't appear in documents converted into .docx + format from for example .odt format. + """ + xpath = ( + # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can + # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which + # is w:p inner-content and both of these can occur inside a table-cell as well as the + # document body + "./w:body/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + ) + + return bool(self.document.element.xpath(xpath)) + + @lazyproperty + def _docx_file(self) -> str | IO[bytes]: + """The Word 2007+ document file to be partitioned. + + This is either a `str` path or a file-like object. `python-docx` accepts either for opening + a document file. + """ + if self._file_path: + return self._file_path + + # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an + # -- exception when Zipfile tries to open it. 
The docx format is a zip archive so we need + # -- to work around that bug here. + if isinstance(self._file, tempfile.SpooledTemporaryFile): + self._file.seek(0) + return io.BytesIO(self._file.read()) + + if self._file: + return self._file + + raise ValueError( + "No DOCX document specified, either `filename` or `file` argument must be provided" + ) + + +class _DocxPartitioner: + """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" + + def __init__(self, opts: DocxPartitionerOptions) -> None: + self._opts = opts @classmethod - def iter_document_elements( - cls, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, - starting_page_number: int = 1, - ) -> Iterator[Element]: + def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]: """Partition MS Word documents (.docx format) into its document elements.""" - self = cls( - filename=filename, - file=file, - metadata_filename=metadata_filename, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - metadata_last_modified=metadata_last_modified, - date_from_file_object=date_from_file_object, - starting_page_number=starting_page_number, - ) + self = cls(opts) # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a # "section-less" document has to be interated differently and has no headers or footers and @@ -315,7 +416,7 @@ def _iter_document_elements(self) -> Iterator[Element]: # -- This implementation composes a collection of iterators into a "combined" iterator # -- return value using `yield from`. 
You can think of the return value as an Element # -- stream and each `yield from` as "add elements found by this function to the stream". - # -- This is functionally analogous to declaring `elements: List[Element] = []` at the top + # -- This is functionally analogous to declaring `elements: list[Element] = []` at the top # -- and using `elements.extend()` for the results of each of the function calls, but is # -- more perfomant, uses less memory (avoids producing and then garbage-collecting all # -- those small lists), is more flexible for later iterator operations like filter, @@ -464,37 +565,7 @@ def iter_row_cells_as_text(row: _Row) -> Iterator[str]: @lazyproperty def _document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" - filename, file = self._filename, self._file - - if filename is not None: - return docx.Document(filename) - - assert file is not None - if isinstance(file, SpooledTemporaryFile): - file.seek(0) - file = io.BytesIO(file.read()) - return docx.Document(file) - - @lazyproperty - def _document_contains_pagebreaks(self) -> bool: - """True when there is at least one page-break detected in the document. - - Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably - inserted by Microsoft Word, but probably don't appear in documents converted into .docx - format from for example .odt format. - """ - xpath = ( - # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can - # appear in a paragraph (w:p). 
w:r can also appear in a hyperlink (w:hyperlink), which - # is w:p inner-content and both of these can occur inside a table-cell as well as the - # document body - "./w:body/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - ) - - return bool(self._document.element.xpath(xpath)) + return self._opts.document @lazyproperty def _document_contains_sections(self) -> bool: @@ -536,12 +607,6 @@ def iter_hdrftr_texts(hdrftr: _Header | _Footer) -> Iterator[str]: return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text) - def _increment_page_number(self) -> Iterator[PageBreak]: - """Increment page-number by 1 and generate a PageBreak element if enabled.""" - self._page_counter += 1 - if self._include_page_breaks: - yield PageBreak("", detection_origin=DETECTION_ORIGIN) - def _is_list_item(self, paragraph: Paragraph) -> bool: """True when `paragraph` can be identified as a list-item.""" if is_bulleted_text(paragraph.text): @@ -593,9 +658,9 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP if isinstance(item, Paragraph): yield from self._classify_paragraph_to_element(item) else: - yield from self._increment_page_number() + yield from self._opts.increment_page_number() - def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]: + def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]: """Generate e.g. 
{"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.""" for run in paragraph.runs: text = run.text.strip() if run.text else "" @@ -628,7 +693,7 @@ def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, ), @@ -657,7 +722,7 @@ def maybe_iter_header(header: _Header, header_footer_type: str) -> Iterator[Head text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, # -- headers are always at the root level} ), @@ -680,7 +745,7 @@ def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Itera """ def page_is_odd() -> bool: - return self._page_counter % 2 == 1 + return self._opts.page_number % 2 == 1 start_type = section.start_type @@ -694,14 +759,14 @@ def page_is_odd() -> bool: # -- on an even page we need two total, add one to supplement the rendered page break # -- to follow. There is no "first-document-page" special case because 1 is odd. if not page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() elif start_type == WD_SECTION_START.ODD_PAGE: # -- the first page of the document is an implicit "new" odd-page, so no page-break -- if section_idx == 0: return if page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of # -- which need our help to get the page-breaks right. 
@@ -711,7 +776,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.""" # -- at present, we always generate exactly one Table element, but we might want # -- to skip, for example, an empty table. - html_table = self._convert_table_to_html(table) if self._infer_table_structure else None + html_table = ( + self._convert_table_to_html(table) if self._opts.infer_table_structure else None + ) text_table = " ".join(self._iter_table_texts(table)) emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table) @@ -720,15 +787,15 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( text_as_html=html_table, - filename=self._metadata_filename, - page_number=self._page_number, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + page_number=self._opts.metadata_page_number, + last_modified=self._opts.last_modified, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, ), ) - def _iter_table_emphasis(self, table: DocxTable) -> Iterator[Dict[str, str]]: + def _iter_table_emphasis(self, table: DocxTable) -> Iterator[dict[str, str]]: """Generate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`.""" for row in table.rows: for cell in row.cells: @@ -765,47 +832,12 @@ def iter_cell_texts(cell: _Cell) -> Iterator[str]: # -- do not generate empty strings -- yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text) - @lazyproperty - def _last_modified(self) -> Optional[str]: - """Last-modified date suitable for use in element metadata.""" - # -- if this file was converted from another format, any last-modified date for the file - # -- will be today, so we get it from the conversion step in `._metadata_last_modified`. 
- if self._metadata_last_modified: - return self._metadata_last_modified - - file_path, file = self._filename, self._file - - # -- if the file is on the filesystem, get its date from there -- - if file_path is not None: - return None if is_temp_file_path(file_path) else get_last_modified_date(file_path) - - # -- otherwise, as long as user explicitly requested it, try getting it from the file-like - # -- object (unlikely since BytesIO and its brethren have no such metadata). - assert file is not None - if self._date_from_file_object: - return get_last_modified_date_from_file(file) - return None - - @property - def _page_number(self) -> Optional[int]: - """The current page number, or None if we can't really tell. - - Page numbers are not added to element metadata if we can't find any page-breaks in the - document (which may be a common case). - - In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual - page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the - target device. Explicit (hard) page-breaks are always recorded in the docx file but the - rendered page-breaks are only added optionally. 
- """ - return self._page_counter if self._document_contains_pagebreaks else None - - def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str]]: + def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]: """[contents, tags] pair describing emphasized text in `paragraph`.""" iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph)) return ([e["text"] for e in iter_p_emph], [e["tag"] for e in iter_p_emph_2]) - def _paragraph_link_meta(self, paragraph: Paragraph) -> Tuple[List[str], List[str], List[Link]]: + def _paragraph_link_meta(self, paragraph: Paragraph) -> tuple[list[str], list[str], list[Link]]: """Describes hyperlinks in `paragraph`, if any.""" if not paragraph.hyperlinks: return [], [], [] @@ -854,12 +886,12 @@ def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: category_depth=category_depth, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, - filename=self._metadata_filename, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + last_modified=self._opts.last_modified, link_texts=link_texts or None, link_urls=link_urls or None, links=links or None, - page_number=self._page_number, + page_number=self._opts.metadata_page_number, ) element_metadata.detection_origin = "docx" return element_metadata @@ -977,7 +1009,7 @@ def _style_based_element_type(self, paragraph: Paragraph) -> Optional[Type[Text] # in the mapping. Unknown style names will also return None. 
return STYLE_TO_ELEMENT_MAPPING.get(style_name) - def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]: + def _table_emphasis(self, table: DocxTable) -> tuple[list[str], list[str]]: """[contents, tags] pair describing emphasized text in `table`.""" iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index fbf364936a..0371866a77 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -6,7 +6,7 @@ import os import re import warnings -from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, Sequence, cast +from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast import numpy as np import pdf2image @@ -128,7 +128,6 @@ def partition_pdf( metadata_filename: Optional[str] = None, # used by decorator metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, # used by decorator - links: Sequence[Link] = [], hi_res_model_name: Optional[str] = None, extract_images_in_pdf: bool = False, extract_image_block_types: Optional[list[str]] = None, diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index e9ada569cf..c2428a7f9a 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -24,6 +24,7 @@ def partition_ppt( file: Optional[IO[bytes]] = None, include_page_breaks: bool = False, include_metadata: bool = True, + include_slide_notes: Optional[bool] = None, infer_table_structure: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, @@ -44,6 +45,8 @@ def partition_ppt( A file-like object using "rb" mode --> open(filename, "rb"). 
include_page_breaks If True, includes a PageBreak element between slides + include_slide_notes + If True, includes the slide notes as element infer_table_structure If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string. @@ -102,11 +105,13 @@ def partition_ppt( pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx") elements = partition_pptx( filename=pptx_filename, + detect_language_per_element=detect_language_per_element, + include_page_breaks=include_page_breaks, + include_slide_notes=include_slide_notes, infer_table_structure=infer_table_structure, + languages=languages, metadata_filename=metadata_filename, metadata_last_modified=metadata_last_modified or last_modification_date, - languages=languages, - detect_language_per_element=detect_language_per_element, starting_page_number=starting_page_number, ) diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 58020ea07e..7cc1924fd1 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -92,7 +92,7 @@ def partition_pptx( date_from_file_object: bool = False, detect_language_per_element: bool = False, include_page_breaks: bool = True, - include_slide_notes: bool = False, + include_slide_notes: Optional[bool] = None, infer_table_structure: bool = True, languages: Optional[list[str]] = ["auto"], metadata_filename: Optional[str] = None, @@ -376,7 +376,7 @@ def __init__( file: Optional[IO[bytes]], file_path: Optional[str], include_page_breaks: bool, - include_slide_notes: bool, + include_slide_notes: Optional[bool], infer_table_structure: bool, metadata_file_path: Optional[str], metadata_last_modified: Optional[str], @@ -413,7 +413,7 @@ def include_page_breaks(self) -> bool: @lazyproperty def include_slide_notes(self) -> bool: """When True, also partition any text found in slide notes as part of each slide.""" - return self._include_slide_notes + return 
False if self._include_slide_notes is None else self._include_slide_notes def increment_page_number(self) -> Iterator[PageBreak]: """Increment page-number by 1 and generate a PageBreak element if enabled.""" diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 78d2b63318..96cd105250 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -3,7 +3,7 @@ import copy import re import textwrap -from typing import IO, Any, Callable, Optional +from typing import IO, Any, Callable, Literal, Optional from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import ( @@ -49,7 +49,7 @@ def partition_text( file: Optional[IO[bytes]] = None, text: Optional[str] = None, encoding: Optional[str] = None, - paragraph_grouper: Optional[Callable[[str], str]] = None, + paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, languages: Optional[list[str]] = ["auto"], @@ -126,7 +126,7 @@ def _partition_text( file: Optional[IO[bytes]] = None, text: Optional[str] = None, encoding: Optional[str] = None, - paragraph_grouper: Optional[Callable[[str], str]] = None, + paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, languages: Optional[list[str]] = ["auto"], diff --git a/unstructured/utils.py b/unstructured/utils.py index 8ebeb2bca5..84f1c52100 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -3,6 +3,7 @@ import functools import html import importlib +import inspect import json import os import platform @@ -33,7 +34,7 @@ from unstructured.__version__ import __version__ if TYPE_CHECKING: - from unstructured.documents.elements import Text + from unstructured.documents.elements import Element, Text # Box format: [x_bottom_left, y_bottom_left, x_top_right, y_top_right] Box: TypeAlias = Tuple[float, float, float, 
float] @@ -46,6 +47,20 @@ _P = ParamSpec("_P") +def get_call_args_applying_defaults( + func: Callable[_P, List[Element]], + *args: _P.args, + **kwargs: _P.kwargs, +) -> dict[str, Any]: + """Map both explicit and default arguments of decorated func call by param name.""" + sig = inspect.signature(func) + call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) + for arg in sig.parameters.values(): + if arg.name not in call_args and arg.default is not arg.empty: + call_args[arg.name] = arg.default + return call_args + + def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str: """Form an HTML table from "rows" and "columns" of `matrix`.