From 6b7fe4469f96c912c4e75da783a070915dd7ce61 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Fri, 6 Oct 2023 13:41:40 -0500 Subject: [PATCH 1/3] add docs with multiple languages for testing (#1591) ### Summary Closes #1536 Adds .txt documents for testing language detection. --- .../language-docs/UDHR_first_article_all.txt | 1511 +++++++++++++++++ example-docs/language-docs/eng_afr_spa.txt | 5 + example-docs/language-docs/eng_spa.txt | 5 + example-docs/language-docs/eng_spa_mult.txt | 9 + 4 files changed, 1530 insertions(+) create mode 100644 example-docs/language-docs/UDHR_first_article_all.txt create mode 100644 example-docs/language-docs/eng_afr_spa.txt create mode 100644 example-docs/language-docs/eng_spa.txt create mode 100644 example-docs/language-docs/eng_spa_mult.txt diff --git a/example-docs/language-docs/UDHR_first_article_all.txt b/example-docs/language-docs/UDHR_first_article_all.txt new file mode 100644 index 0000000000..e08e98b2b5 --- /dev/null +++ b/example-docs/language-docs/UDHR_first_article_all.txt @@ -0,0 +1,1511 @@ +Universal Declaration of Human Right - First article, all languages + +© 1996 – 2009 The Office of the High Commissioner for Human Rights + +This plain text version prepared by the "UDHR in Unicode" project, +https://www.unicode.org/udhr. + +------ +(Bizisa) +Novdiex nongv liex hufniv dav zer nier, zunxyanr niex qianrlir garhaf hufniv dav zer nier. Gixzex livxinf niex lianrxinx xief, xiongxdif guanxxif nier jinxsenr gof dav duifdaif dor. + +(Jinan) +是人都生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以弟兄关系的精神相对待。 + +(Klau) +Chix zox key zifyour, an hu tsunxyanr thungs chianrlif nu phinrten. Tsoxnur nes lishinf thungs leyx o, laiv kuanxshif to tseyr ti cinxsenr shiangxtaif. + +(Maiunan) +Renr rangf lyeuf xif zifyaot, yur zunxyant ndams chwentlif xif pingtdengl. Ter mev lilsingf ndams lyangtsinx, zingxsent gwanxsif vaif nungf. 
+ +(Mijisa) +Novzeu nongv lie kylix dav zeu xi, zunyan nie qianlif gahaf kylix dav zeu xi. Gyxzeu livxinf nie lixtolo ca xie, xiongdif guanxif ai jinsen go duifdaif do. + +(Minjiang, spoken) +Renren sen xialai de si ziyou li, zai zunlian ho quanli sang yelue pinden. Tamen fuyou lixin ho liangxin, hai yingai na xiongdi guanxi li jinsen fuxiang duidai. + +(Minjiang, written) +Renren sen er ziyou, zai zunlian ho quanli sang yelue pinden. Tamen fuyou lixin ho liangxin, bin yin yi xiongdi guanxi li jinsen xiang duidai. + +(Muzzi) +Nia ngir ngir ym mexker bbumlix zifyiyr, zunyanr gix jjuanlif alyf bbumlix zzifsof wur. Garxier lixxinf gix xierfux cor yif, xiongxdif guanxif wur jinsenr la lo rim hix. + +(Yeonbyeon) +사람들이 이 세계로 오다가 모두 자유하고, 존엄과 권리이 평동으로 있는다, 그들 리성과 양심이 있눈고, 형제의 정신으로 상호로 치료하 소. + +Abkhaz +Дарбанзаалак ауаҩы дшоуп ихы дақәиҭны. Ауаа зегь зинлеи патулеи еиҟароуп. Урҭ ирымоуп ахшыҩи аламыси, дара дарагь аешьеи аешьеи реиҧш еизыҟазароуп. + +Aceh +Bandum manusia lahee bebah merdeka deungon hak ngon martabat nyang sama. Ngon akai taseumikee, ngon atee tameurasa bandum geutanyoe lagee syedara. + +Achuar-Shiwiar +Aints ainauti mash metek nuwanmaya akiinawaitji. Turasha angkan pengker pujusmi tusar akiinawaitji. Aintstikia mash ji nintijai paan nintimratnuitji, turasha pengker aa nu nekaatnuitji. Turasha pase aa nusha nekaatnuitji. Turasha ji pataachiri ainaujai pengker nintimtunisar pujaj ina nunisrik chikich ainauj aisha pengker nintimtunisar pujustinuitji. + +Achuar-Shiwiar (1) +Penker inintimsamka mash aintsti ankan, metekrin nuya nii penkerin takakui nii +akiniamunmaya tu ausamti arantukmau atinuitji mai metekrak. + +Adyghe +Цӏыф пстэури шъхьэфитэу, ялъытэныгъэрэ яфэшъуашэхэмрэкӏэ зэфэдэу къалъфы. Акъылрэ зэхэшӏыкӏ гъуазэрэ яӏэшъы, зыр зым зэкъош зэхашІэ азфагу дэлъэу зэфыщытынхэ фае. + +Afar +Karaamat kee garwa wagittaamal seehada inkih gide akkuk, currik taabuke. Usun kas kee cissi loonuuh, keenik mariiy mara lih toobokinni kasat gexsitam faxximta. 
+ +Afrikaans +Alle menslike wesens word vry, met gelyke waardigheid en regte, gebore. Hulle het rede en gewete en behoort in die gees van broederskap teenoor mekaar op te tree. + +Aguaruna +Ashi aents aidauk agkan akinui, betek eme anentsa aentsmasa diyam atanmash, tuja aents anentaibau, aents dutikatasa wakej amu yupichu dutimainnum, tuja ni wakejamun takakush tikish bakushminnum, nuniak tikish aidaujaish shiig yatsuta anmamut ati tusa. + +Aja +Agbetɔwo pleŋu vanɔ gbɛmɛ ko vovoɖeka gbeswɛgbeswɛ, sɔto amɛnyinyi ko acɛwo gomɛ; wo xɔnɔ susunywin ko jimɛnywi so esexwe. Wo ɖo a wɛ nɔvi ɖaɖa wowo nɔnɔwo gbɔ. + +Albanian, Tosk +Të gjithë njerëzit lindin të lirë dhe të barabartë në dinjitet dhe në të drejta. Ata kanë arsye dhe ndërgjegje dhe duhet të sillen ndaj njëri tjetrit me frymë vëllazërimi. + +Alemannisch (Elsassisch) +Àlli Mensche kùmme mìt de gliche Wìrde ùn Rachte ùff d’Walt. Sie hàn àlli Vernùnft ùn Gewìsse ùn selle mìt Brìederlichkeit de àndere gejjenìwwer hàndle. + +Altai, Southern +Ончо улус ак‐јарыкка јайым ла теҥ‐тай тап‐эриктӱ туулат. Олор санааукаалу ла чек кӱӱн‐тапту болуп бӱткен ле бой‐бойын карындаш кирези кӧрӧр лӧ јӱрер учурлу. + +Amahuaca +Tzovan jato jumahaito hinaayamanonmun vacunoxcanquihnucanpu. Tzovan jato zinaayamanonmun vacunoxcanquihnucanpu. Jonitzan derechocavizyahtoxrivimun vacunoxcanquihqui. Quiyoovinin shinanquin hiromaquin jaucuzahavorahquiqui shinantimunhcanquihqui. Vacurazixquicavizhi quiyoovinixjatimunhcanquihnucanpu. + +Amarakaeri +Aya'da aratbut katepi' eka'ta' on'pakpo ka'dik o̱'ne. Nog aratbut huadak o̱'nepo ko̱nigti opudo̱mey huadak mo'e̱. Aya'da huadak eka' nopoe̱'dik o̱'ne kenpa'ti dakhuea' eka' nopoe̱'dik o̱'ne kenpa'ti ko̱nig huama'buytaj o 'tihuapokika' ko̱nigti nogo̱meytaj tihuapokika 'dik o̱'ne. + +Amharic +የሰው፡ልጅ፡ሁሉ፡ሲወለድ፡ነጻና፡በክብርና፡በመብትም፡እኩልነት፡ያለው፡ነው።፡የተፈጥሮ፡ማስተዋልና፡ሕሊና፡ስላለው፡አንዱ፡ሌላውን፡በወንድማማችነት፡መንፈስ፡መመልከት፡ይገባዋል። + +Amis +Chiyu mahufuchay tu tamlaw, maemin pingdeng ichunyan a kngli. 
Iraay chaira lishing a naay a naay a harateng, pimaulahsha u harateng nu kaka shafa. + +Arabela +Pueyano pa quishacari, puetunu pajaniyajanaa mariyata miishiya maninia, maja sooshiya tamonu. Puetunu pueyajanaari niishitiajaraca, jiuujiaaracanio pueyacua pa taraajenura. Naarate maninia pa jiyanootioore juhua pa tapueyocuaca. + +Arabic, Standard +يولد جميع الناس أحرارًا متساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإخاء. + +Armenian +Բոլոր մարդիկ ծնվում են ազատ ու հավասար իրենց արժանապատվությամբ ու իրավունքներով։ Նրանք ունեն բանականություն ու խիղճ և միմյանց պետք է եղբայրաբար վերաբերվեն։ + +Aromanian +Tuti iatsâli umineshtsâ s-fac liberi shi egali la nâmuzea shi-ndrepturli. Eali suntu hârziti cu fichiri shi sinidisi shi lipseashti un cu alantu sh-si poartâ tu duhlu-a frâtsâljiljei. + +Asháninka +Aquempetavacaajeita maaroni atiri. Timatsi aquenqueshirejeitantari maaroni, timatsi amejeitari, ayojeiti paitarica ocameetsati antajeitiri: te oncameetsateji intsaneapitsajeiteero itsipapee. Te oncameetsateji imperanajeitee, te oncameetsateji iroashinoncaajeitee, irointi ocameetsati aacameetsatavacaajeitea. + +Ashéninka, Pichis +Maaroni atziripayeeni, ovaquera intzimapaaque, eero ocantzi iñaashitacaavaitaityaari iromperanataityaari. Eejatzi oquemitari iroñaaca te apantyaaro amanitashireteri atziri ancanteri: "Te pirjiperote eeroca, iriima irinta iriitaque ñaaperori". Eejatzi oquemitari te oncameethate intzime aparoni atziri antayetashityaarone caari ishinetaacairi pashine irantero. Tema maaroni ayotziro ampampithashirvaayeta, ayotziro tsicarica otzimayetzi cameethatatsiri anteri o tsicarica otzimi caariperotatsiri, irootaque ocovaperotantari iroñaaca entacotavacaayetya anquemitacaantanaquero arentzitavacaatyeeyaami ocaaquiini. + +Assyrian Neo-Aramaic +ܟܠ ܒܪܢܫܐ ܒܪܝܠܗ ܚܐܪܐ ܘܒܪܒܪ ܓܘ ܐܝܩܪܐ ܘܙܕܩܐ. ܘܦܝܫܝܠܗ ܝܗܒܐ ܗܘܢܐ ܘܐܢܝܬ. ܒܘܕ ܕܐܗܐ ܓܫܩܬܝ ܥܠ ܐܚܪܢܐ ܓܪܓ ܗܘܝܐ ܒܚܕ ܪܘܚܐ ܕܐܚܢܘܬܐ. 
+ +Asturian +Tolos seres humanos nacen llibres y iguales en dignidá y drechos y, pola mor de la razón y la conciencia de so, han comportase hermaniblemente los unos colos otros. + +Awa-Cuaiquer + + +Aymara, Central +Taqpach jaqejh khuskat uñjatatäpjhewa munañapansa, lurañapansa, amuyasiñapansa, ukatwa jilani sullkanípjhaspas ukham uñjasipjhañapawa. + +Azerbaijani, North (Cyrillic) +Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад вә бәрабәр доғулурлар. Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар. + +Azerbaijani, North (Latin) +Bütün insanlar ləyaqət və hüquqlarına görə azad və bərabər doğulurlar. Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar. + +Baatonum +Ba tɔmbu kpuro marawa ba tii mɔ, ba nɛ, girima ka saria sɔɔ. Ba ra bwisiku, ba dasabu mɔ, ma n weene ba n waasinɛ mɛrobisiru sɔɔ. + +Bali +Sami manusane sane nyruwadi wantah merdeka tur maduwe kautamaan lan hak-hak sane pateh. Sami kalugrain papineh lan idep tur mangdane pada masawitra melarapan semangat pakulawargaan. + +Bamanankan +Hadamaden bɛɛ danmakɛɲɛnen bɛ bange, danbe ni josira la. Hakili ni taasi b’u bɛɛ la, wa u ka kan ka badenɲasira de waleya u ni ɲɔgɔn cɛ. + +Bamun +Pe nâ mvé gú puen nyütu pô te mbe kú ghét ngam pua ngúenengúe mbe te wûme nsebe pua pa mféékêt. Pen â ntúm te mbe kú rem ngam pua fabshe ngam, a nshi njîr’ap ne yi nshâne ngétne nga shap pô te wupme pontâ. + +Baoulé +Sran mun be ngba, kɛ be wu be ɔ, be ngba be sɛ, fɔndi nun, sran-mmala nun. Be si akundanbu, be si su ɔ fata kɛ sran mun be tran'n, be tran aniaan nun tranlɛ. + +Bari +Ŋutu liŋ a yuŋwe kana, jojo i toďiri ko ďekesi ko ti se tokitaki ko ‘börik ko mulökötyo lo toluŋaseran. Se a ďoka ko denet na kulya na’but ko narok. + +Basque +Gizon-emakume guztiak aske jaiotzen dira, duintasun eta eskubide berberak dituztela; eta ezaguera eta kontzientzia dutenez gero, elkarren artean senide legez jokatu beharra dute. 
+ +Belarusan +Усе людзі нараджаюцца свабоднымі і роўнымі ў сваёй годнасці і правах. Яны надзелены розумам і сумленнем і павінны ставіцца адзін да аднаго ў духу брацтва. + +Bemba +Abantu bonse bafyalwa abalubuka nokulingana mu mucinshi nensambu. Balikwata amano nokutontonkanya, eico bafwile ukulacita ifintu ku banabo mu mutima wa bwananyina. + +Bengali +সমস্ত মানুষ স্বাধীনভাবে সমান মর্যাদা এবং অধিকার নিয়ে জন্মগ্রহণ করে। তাঁদের বিবেক এবং বুদ্ধি আছে; সুতরাং সকলেরই একে অপরের প্রতি ভ্রাতৃত্বসুলভ মনোভাব নিয়ে আচরণ করা উচিত। + +Bhojpuri +सबहि लोकानि आजादे जम्मेला आओर ओखिनियो के बराबर सम्मान आओर अधिकार प्राप्त हवे। ओखिनियो के पास समझ-बूझ आओर अंत:करण के आवाज होखता आओर हुनको के दोसरा के साथ भाईचारा के बेवहार करे के होखला। + +Bicolano, Central +An gabos na tawo ipinangaking may katalinkasan asin parantay sa dignidad asin derechos. Sinda gabos tinawan nin pag-isip asin conciencia kaya dapat na makipag-iriba sa lambang saro bilang mga magturugang. + +Bislama +Evri man mo woman i bon fri mo ikwol long respek mo ol raet. Oli gat risen mo tingting mo oli mas tritim wanwan long olgeta olsem ol brata mo sista. + +Bora +Pámeere ííñújɨri meíjcyame tsá múhójɨ́sɨ́ pañé ɨ́cubáhrádú meíjcyáítyuróne. Pámeere tsahdúré imí meíjcyame mewájyújcatsíñe mépɨ́áábójcatsíiyá tsaatéké éhdɨ́Ȉ́válletúmé éhne múu mépañétúéné nahbémuma meíjcyadu. + +Bosnian (Cyrillic) +Сва људска бића раћају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свијешћу и треба да једно према другоме поступају у духу братства. + +Bosnian (Latin) +Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva. + +Breton +Dieub ha par en o dellezegezh hag o gwirioù eo ganet an holl dud. Poell ha skiant zo dezho ha dleout a reont bevañ an eil gant egile en ur spered a genvreudeuriezh. + +Bugis +Sininna rupa tau ri jajiangngi rilinoe nappunnai manengngi riasengnge alebbireng . 
Nappunai riasengnge akkaleng, nappunai riasengnge ati marennni na sibole bolena pada sipakatau pada massalasureng. + +Bulgarian +Всички хора се раждат свободни и равни по достойнство и права. Те са надарени с разум и съвест и следва да се отнасят помежду си в дух на братство. + +Bulu +Abiali bod bese, tege ai sesala, bene etie dzia a mis memvende y'enyiñ, dzom dzia etu fili nkóbó, fili ntsogan, fili mboan. Ve abiali te, mod ose ayem dze ene abe, dze ene mbeñ asu e mod mbog antoa ai mfi na enyiñ ewulu mezen mene sosoo. + +Burmese +လူတိုင်းသည် တူညီ လွတ်လပ်သော ဂုဏ်သိက္ခာဖြင့် လည်းကောင်း၊ တူညီလွတ်လပ်သော အခွင့်အရေးများဖြင့် လည်းကောင်း၊ မွေးဖွားလာသူများ ဖြစ်သည်။ ထိုသူတို့၌ ပိုင်းခြား ဝေဖန်တတ်သော ဉာဏ်နှင့် ကျင့်ဝတ် သိတတ်သော စိတ်တို့ရှိကြ၍ ထိုသူတို့သည် အချင်းချင်း မေတ္တာထား၍ ဆက်ဆံကျင့်သုံးသင့်၏။ + +Bushi +Ɓinadamu djabi nitirahinyi an-nafasi, reu bokeu miraŋa amin’ni usheu ndreka haki. Reu teraka ndreka ãkili ndreka hikima, amin’ni zenyi, reu nikulazimu nisi tweraŋa nin-fihavaŋa reu sambi reu. + +Candoshi-Shapra +Iy tpotsini ichigoroni kis tamam zadkini, vatam tpotsiniva. Vatam ichigoroni magini tarova; ashiriya chinakaniya. Ashirocha, zovalliatsich tamaparia-ashiros sanpata chinagtsa atiniya. + +Caquinte +Aquejetavacaajiaca maasano caquinte. Chooca aquenquejantaca maasano, chooca amejigaca, atsajiaque taaca opajitapae ocameetsataque antajiguica. Tee oncameetsateji iromperaperanajicaji, tee oncameetsateji irogashinoncajajiacaji. Jero cameetsatatsica aavacaj aiaquempa. + +Cashibo-Cacataibo +Ui uni cara 'iti icë axbi ca bëtsi unibë gobiernonën iscëx sënën ití icën. Ui cara ain tita ain papa 'iaxa quixun sinanquinma ca gobiernonën sinancëx ax bëtsibë sënën 'icën. Camaxunbi ca sinanti 'unanin. Camaxunbi ca añu ñu ati cara asábi 'icën, añu ñu 'ati cara 'aisama 'icë quixun 'unanti 'icën. Usa 'ain ca camaxbi ain xucënbë 'icësaribiti nuiananti 'icën. + +Cashinahua +Yudabu dasibi jabiaskadi akin, xinantidubuki. Javen taea jau jaibunamenunbunven. 
+ +Catalan-Valencian-Balear +Tots els éssers humans neixen lliures i iguals en dignitat i en drets. Són dotats de raó i de consciència, i han de comportar-se fraternalment els uns amb els altres. + +Cebuano +Ang tanang katawhan gipakatawo nga may kagawasan ug managsama sa kabililhon. Sila gigasahan sa salabutan ug tanlag og mag-ilhanay isip managsoon sa usa'g-usa diha sa diwa sa ospiritu. + +Chachi +Naaju chachilla bain mu' chachilla bain na kayatu tichiba bulla jutyu naakendya'ba kenu deechu taa na kayamu deju, tsenminya,naaju juñu bain ne tsaave ti', uukavinu jutyu naa, tideechu juuchi bain, mubain mubain tsaren dejuve, tsenmin shilli pensangenu pude deju'. mitya, tsenr)1in ura' kendu bain ura' kendyu' bain mide' mitya muba mu bain veta' veta' ura' keewaawaa kenuu dejuve. + +Chakma + 𑄝𑄬𑄇𑄴 𑄟𑄚𑄪𑄌𑄴 𑄚𑄨𑄢𑄨𑄞𑄨𑄣𑄨 𑄥𑄧𑄁 𑄃𑄨𑄌𑄴𑄎𑄮𑄖𑄴 𑄃𑄳𑄃 𑄃𑄇𑄴𑄇𑄥𑄁 𑄚𑄨𑄚𑄬𑄭 𑄎𑄧𑄚𑄴𑄟𑄚𑄴𑅁 𑄖𑄢𑄢𑄴 𑄃𑄬𑄘 𑄃𑄳𑄃 𑄝𑄪𑄖𑄴𑄙𑄨 𑄃𑄊𑄬; 𑄥𑄬𑄚𑄧𑄖𑄳𑄠𑄴 𑄝𑄬𑄇𑄴𑄅𑄚𑄧𑄢𑄴 𑄃𑄬𑄇𑄴𑄎𑄧𑄚𑄴 𑄃𑄢𑄬𑄇𑄴 𑄎𑄧𑄚𑄧𑄢𑄴 𑄛𑄳𑄢𑄧𑄖𑄨 𑄉𑄧𑄟𑄴 𑄘𑄮𑄣𑄴 𑄌𑄨𑄘𑄳𑄠𑄬 𑄚𑄨𑄚𑄬𑄭 𑄌𑄧𑄣𑄚 𑄅𑄪𑄌𑄨𑄖𑄴𑅁 + +Chamorro +Todo taotao siha man mafanago libertao yan pareho gi dignidad yan derecho siha, man manae siha hinaso yan consiencia yan debe de ufatinas contra uno yan otro gi un espiritun chumelo. + +Chayahuita +Ya'ipi piyapinpoa' capini noya ninosorocaso' ya'huërin. Ya'ipinpoa' yonquirëhua'. Noya nicacaso' nitotërëhua'. Napoaton iyanpoa pochin ninosorocaso' ya 'huërin. + +Cherokee (cased) +Ꮒꭶꮣ ꭰꮒᏼꮻ ꭴꮎꮥꮕꭲ ꭴꮎꮪꮣꮄꮣ ꭰꮄ ꭱꮷꮃꭽꮙ ꮎꭲ ꭰꮲꮙꮩꮧ ꭰꮄ ꭴꮒꮂ ꭲᏻꮎꮫꮧꭲ. Ꮎꮝꭹꮎꮓ ꭴꮅꮝꭺꮈꮤꮕꭹ ꭴꮰꮿꮝꮧ ꮕᏸꮅꮫꭹ ꭰꮄ ꭰꮣꮕꮦꮯꮣꮝꮧ ꭰꮄ ꭱꮅꮝꮧ ꮟᏼꮻꭽ ꮒꮪꮎꮣꮫꮎꮥꭼꭹ ꮎ ꮧꮎꮣꮕꮯ ꭰꮣꮕꮩ ꭼꮧ. + +Cherokee (uppercase) +ᏂᎦᏓ ᎠᏂᏴᏫ ᎤᎾᏕᏅᎢ ᎤᎾᏚᏓᎴᏓ ᎠᎴ ᎡᏧᎳᎭᏉ ᎾᎢ ᎠᏢᏉᏙᏗ ᎠᎴ ᎤᏂᎲ ᎢᏳᎾᏛᏗᎢ. ᎾᏍᎩᎾᏃ ᎤᎵᏍᎪᎸᏔᏅᎩ ᎤᏠᏯᏍᏗ ᏅᏰᎵᏛᎩ ᎠᎴ ᎠᏓᏅᏖᏟᏓᏍᏗ ᎠᎴ ᎡᎵᏍᏗ ᏏᏴᏫᎭ ᏂᏚᎾᏓᏛᎾᏕᎬᎩ Ꮎ ᏗᎾᏓᏅᏟ ᎠᏓᏅᏙ ᎬᏗ. + +Chickasaw +Himmaka' nittakookano hattak yokasht toksalicha'nikat ki'yo. Hattak mó̱makat ittíllawwi bíyyi'kacha nanna mó̱maka̱ ittibaachaffa'hitok. + +Chin, Falam +Mikip in bangrep ii zalen nak le sunlawih nak thawn, bangrep in covo nei in, asuak mi kan si. 
Anmah in hleidan thei nak fim nak le nuncan neih thei nak ruah nak nei ii, pakhat le pakhat duh dawt nak, pawl awk nak nei ding asi. + +Chin, Haka +Mi vialte hna cu zalong te, ai tluk te le upat tihzah awktlak le thiltikhawhnak tinvo a ngei in a chuak mi kan si dih. Minung cu a chia a tha thleidang khomi ruahnak le theihthiamnak ziaza tha a ngeimi kan si caah pakhat le pakhat dawtnak he i pehtlaihnak le i hawikawmhnak ngeih ding kan si. + +Chin, Matu +Thlangboeih he rhimomna, vanpitna, yalpona hamhmoel ka tawn thlang la cuun la ng’om u. Thlanghing he athae-then paekboe thaina neh yakming thaina moeiboe ka tawn thlang la n’om u dong ah khat neh khat lungvat na neh thloehlan voekhlak u thae ham om. + +Chin, Tedim +Mihingte khempeuh in thupitak leh thuneihna tawh suakta tak leh akibang in kipiang ciat ahi hi. Asia leh apha khentel thei thungaihsutna nei ciat uh ahihman in khat leh khat sanggam unau lungsim tawh kithuah khop ding hi. + +Chinantec, Chiltepec +Lejɨ̈ ni sou tsa lisia̱ ija̱a sia ikou' ne kojo̱ jï ne juso̱ ne jmo' re ju i sɨ' jmo' nö sala̱ ne sasno. + +Chinantec, Ojitlán +La juu dsa lu siä –Dsa kö ñi ba dsa, nía kö ni' ba na lu' dsa e dsa tï é li jnia' roö'. 
+ +Chinese, Gan +人人生而自由,在志向跟权利上一律平等。渠们赋有理性跟良心,并理当以弟兄义气相对待。 + +Chinese, Hakka +人人生而自由,在尊严同权利上一律平等。佢丁人赋有理性同好心田,并应以兄弟关系个精神相对待。 + +Chinese, Jinyu +人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以弟兄关系的精神相对待。 + +Chinese, Mandarin (Beijing) +人人生而自由,挨尊严和权利上一刬平等。他们趁理性和良心,并应以一个座儿的精神相对待。 + +Chinese, Mandarin (Guiyang) +人人生而自由,在尊严和权利上一律是平等的。他们赋有理性和良心,并应以兄弟关系的精神相互对待。 + +Chinese, Mandarin (Harbin) +人人生而自由,在尊严和权利之上一律平等。他们赋有理性和良心,并应以哥们弟兄的精神相对待。 + +Chinese, Mandarin (Nanjing) +大家生而自由,在尊严告权利上头一律平等。他们赋有理性告良心,并该派以兄弟关系的精神相对待。 + +Chinese, Mandarin (Simplified) +人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄弟关系的精神相对待。 + +Chinese, Mandarin (Tianjin) +人个顶个生而自由,在尊严和权利上般儿般儿大。他们趁理性和良心,并应以兄弟关系的精神相对待。 + +Chinese, Mandarin (Traditional) +人人生而自由,在尊嚴和權利上一律平等。他們賦有理性和良心,並應以兄弟關係的精神相對待。 + +Chinese, Min Nan +人人生而自由,在尊严合权利上一律平等。因赋有脾胃合道行,并着以兄弟关系的精神相对待。 + +Chinese, Wu +人人生而自由,拉尊严脱仔权利上一律平等。伊拉有理性脱仔良心,并应以兄弟关系个精神相对待。 + +Chinese, Xiang +人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,在得以兄弟关系的精神相对待。 + +Chinese, Yue +人人生而平等,喺尊严同埋权利上一律平等。佢哋有理性同埋良心,而且应当以兄弟关系嘅精神相对待。 + +Chokwe +Mwese yoze masemuka katela ukulungunga ulengunga ulemu nyi vumbi eswe ci mwikha. Eswe kalingile kupwa nyi usambe nyi mangana nyi kuhasa kulimika nyumwe nyi mukwo nyi kulita nyi mbunge ya ulemu wa utu. + +Chokwe (Angola) +Athu eswe kakusemuka ngwe akwo, ku vumbi nyi hakusakula.Kali nyi mana,mba mahasa kulinga umwu hali mukwo nyi espiritu ya kuli kuasa. + +Chuukese +Esap wor och mettoch epwe appeti aramas seni fansoun ar uputiu non ar tufich me rait. Ar ekiek epwe mecheres o esap pet ren och sakkun mettoch pun ir repwe nonnomfengen non kinamwe o pwipwi annim. + +Chuvash +Пур халӑх та уйрӑм пурӑнма пӗр тан праваллӑ. Ҫак правапа усӑ курса вӗсем хӑйсен политика статусне ирӗклӗн туса хураҫҫӗ, экономика, общество тата культура енӗпе ирӗклӗн аталанаҫҫӗ. Патшалӑхсен ҫак правӑна хисеплемелле, территори пӗр пӗтӗмлӗхӗн принципӗ унпа усӑ курма пӗр енлӗн чарса тӑракан чӑрмав пулмалла мар. 
+ +Colorado +Piyanle tsa'chila, mankarijun, junshi manta tan, in tobi jaminlajoyoe, titi mi, tenka kano min, junshi, tsa'chila tala, sen jono min. + +Comorian, Maore +Wanadamu piya udzalwa huru tsena sawa ha ufahari na ha haki. Na wawo wana ãkili na hisi, esa ilazimu wadzivhinge na wanyao ha fikira ya unanya. + +Comorian, Ngazidja +Wo wanadamu piya wo uzalwa na uhuriya na usawa waki undru na uhaki. Wo upwawa ankili na urambuzi hayizo yilazimu warwaliyane hazitrendwa na fikira zaki unanya. + +Corsican +Nascinu tutti l’omi libari è pari di dignità è di diritti. Pussedinu a raghjoni è a cuscenza è li tocca ad agiscia trà elli di modu fraternu. + +Cree, Swampy +ᒥᓯᐌ ᐃᓂᓂᐤ ᑎᐯᓂᒥᑎᓱᐎᓂᐠ ᐁᔑ ᓂᑕᐎᑭᐟ ᓀᐢᑕ ᐯᔭᑾᐣ ᑭᒋ ᐃᔑ ᑲᓇᐗᐸᒥᑯᐎᓯᐟ ᑭᐢᑌᓂᒥᑎᓱᐎᓂᐠ ᓀᐢᑕ ᒥᓂᑯᐎᓯᐎᓇ᙮ ᐁ ᐸᑭᑎᓇᒪᒋᐠ ᑲᑫᑕᐌᓂᑕᒧᐎᓂᓂᐤ ᓀᐢᑕ ᒥᑐᓀᓂᒋᑲᓂᓂᐤ ᓀᐢᑕ ᐎᒋᑴᓯᑐᐎᓂᐠ ᑭᒋ ᐃᔑ ᑲᓇᐗᐸᒥᑐᒋᐠ᙮ + +Crimean Tatar +Bütün insanlar serbestlik, menlik ve uquqlarda musaviy olıp dünyağa keleler. Olar aqıl ve vicdan saibidirler ve biri-birilerinen qardaşçasına munasebette bulunmalıdırlar + +Crioulo, Upper Guinea +Tudu pekaduris ta padidu libri i igual na balur suma na diritus. Suma e dadu kapasidadi di pensa, e tene tambi konsiensia, e dibi di trata ñutru suma ermons. + +Crioulo, Upper Guinea (008) +Tudu pecadur padidu livre, ninguin ca más ninguin, tudu djusta, tudu tem mesmu diritu. Tudu quin qui padidu, tem si roçon, cu si manera di pensa. Na metadi di utrus I díbidi fassi cussas cu ermondadi. + +Croatian +Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva. + +Czech +Všichni lidé rodí se svobodní a sobě rovní co do důstojnosti a práv. Jsou nadáni rozumem a svědomím a mají spolu jednat v duchu bratrství. + +Dagaare, Southern +Nengsaala zaa ba nang dɔge so la o menga, ka o ne o taaba zaa sengtaa noba emmo ane yɛlɛsoobo sobic poɔ. Ba dɔgɛɛ ba zaa ne yɛng ane yɛlɛ-iruu k'a da seng ka ba erɛ yɛlɛ korɔ taa a nga yɔɔmine. + +Dagbani +Sal' la sala. 
Bɛhig' be sokam sanimi, din pa la amii. Suhizɔbo be sokam sani; ka nambɔɣu beni. Suhubɔhibo mi bi lan kɔŋ yigunaadam kam sani. Dinzuɣu dimbɔŋɔ zaa wuhiya ka dama di tu kamaata ka ti zaa yu tab' hali ni ti puuni. + +Dangme +Adesahi tsuo ɔ, a bɔ mɛ nɛ nɔ fɛɛ nɔ e ye e he, nɛ nɔ tsuaa nɔsɔ ngɛ odehe si himi kɛ he blɔhi a blɔ fa mi. A bɔ mɛ kɛ nɔ́ se kɔmi kɛ he nule juɛmi, nɛ e hia kaa nɔ fɛɛ nɔ nɛ e na nyɛmi suɔmi kɛ ha nɔ tsuaa nɔ. + +Danish +Alle mennesker er født frie og lige i værdighed og rettigheder. De er udstyret med fornuft og samvittighed, og de bør handle mod hverandre i en broderskabets ånd. + +Dari +تمام افراد بشر آزاد به دنیا می‌آیند و از لحاظ حیثیت و حقوق با هم برابرند. همه دارای عقل و وجدان هستند و باید نسبت به یکدیگر با روح برادری رفتار کنند. + +Dendi +Aduniya kuna n gu ibuna damayo hɛi nɔ dei-dei nn daama nna n burucinitɛrɛ fɔ, n lasabu nna laakari ya nam nn mɔ huro cɛrɛ kuna nyanze tɛrɛ bɔŋɔɔ. + +Dinka, Northeastern +Raan thök eben aye dhëëth ka lau nhöm kua thöŋ nhiim eyithiic, kua thɛ̈kic, kua ci yëknhiethku puou, ku bik cëŋ ka ke ye mith etik. + +Ditammari +Oniti ti pɛi nɖɛ omɔũ yi kpaatri otɔu, kɛ yɛ̃ oniti ba we, o yi ɖo nnɛ fɛhɔ̃fɛ; o mɔkɛmu mɛcii kɛhã mɛyɛmmɛ. Ti tú nɛ ɖo kenyari ti tɔbɛ mbɛ kɛ yie mii ba nkwuɔ ko otɔu ɖau. + +Drung +Avzangf max pyvccuf byv syvnax zyxyyv ef, lifxingx ningx lyangvxinx alf, taixrav angvnikxrav gwanxxix mix syv av duixdaix. + +Dutch +Alle mensen worden vrij en gelijk in waardigheid en rechten geboren. Zij zijn begiftigd met verstand en geweten, en behoren zich jegens elkander in een geest van broederschap te gedragen. + +Dzongkha +འགྲོ་བ་མི་ཚུ་ག་ར་དལ་དབང་གི་ཐོག་ལས་སྐྱེས་ཏེ་ཡོདཔ་ལས་ ག་ར་ལུ་བརྩི་མཐོང་དང་ཐོབ་དབང་འདྲ་མཉམ་སྦེ་ཡོད། མི་ཚུ་ག་ར་སྨྲ་ཤེས་དོན་གོ་བའི་མཚན་ཉིད་དང་ལྡནམ་ལས་ ག་ར་གིས་ལཱ་ག་ཅི་ར་འབད་རུང་ གཅིག་གིས་གཅིག་ལུ་སྤུན་ཆའི་འདུ་ཤེས་བསྐྱེད་ཐོག་ལས་ལཱ་འབད་དགོ། + +Edo +Emwan ne agbon hia ne a biere, a bie iran noyan-egbe iran kevbe wee, umwon-mwen o ree etin hia ne o kheke iran khin. 
A ye ewaen kevbe ekhoe ne o maa wu iran, ne iran gha yin da egbe vbe orhion oghe eten-okpa. + +English +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +Ese Ejja +Ojjaña esejja ojjaña oyaja yojjaya cuayani quiapame oyajayojjaya quiapame ojjaña eseya quiapame quia tai jjashauabataiquiani ecueya epejji jayo jjaya ojjaña jajji ojjañajaassi eseyajayojja. + +Esperanto +Ĉiuj homoj estas denaske liberaj kaj egalaj laŭ digno kaj rajtoj. Ili posedas racion kaj konsciencon, kaj devus konduti unu al alia en spirito de frateco. + +Estonian +Kõik inimesed sünnivad vabadena ja võrdsetena oma väärikuselt ja õigustelt. Neile on antud mõistus ja südametunnistus ja nende suhtumist üksteisesse peab kandma vendluse vaim. + +Even +Бэйил бокэтчур омэн хилкич нян урумкэр балдаритно, теми ноҥардук эгдьэн ҥи‐да ачча. Бэйил бөкэтчур мэн долан акагчимур биннэтын. + +Evenki +Упкат илэл ты̄нмукирди, урэ̄лди мэ̄нңи са̄рича̄ди балдыдяра. Нуңартын дялитви, һалдяндыви биси, мэмэгӣлвэр аяралды̄дяна тэдет о̄мамачитын. + +Éwé +Wodzi amegbetɔwo katã ablɔɖeviwoe eye wodzena bubu kple gomekpɔkpɔ sɔsɔe. Susu kple dzitsinya le wo dometɔ ɖesiaɖe si eyata wodze be woanɔ anyi le ɖekawɔwɔ blibo me. + +Fante +Wɔwo adasa nyina to fahodzi mu, na hɔn nyina yɛ pɛr wɔ enyimnyam na ndzinoa mu. Wɔmaa hɔn nyina adwen na tsibowa, na ɔwɔ dɛ hɔn nkitahodzi mu ndzeyɛɛ da no edzi dɛ wɔyɛ enuanom. + +Faroese +Øll menniskju eru fødd fræls og jøvn til virðingar og mannarættindi. Tey hava skil og samvitsku og eiga at fara hvørt um annað í bróðuranda. + +Farsi, Western +تمام افراد بشر آزاد بدنیا میایند و از لحاظ حیثیت و حقوق با هم برابرند. همه دارای عقل و وجدان میباشند و باید نسبت بیکدیگر با روح برادری رفتار کنند. + +Fijian +Era sucu ena galala na tamata yadua, era tautauvata ena nodra dokai kei na nodra dodonu. 
E tiko na nodra vakasama kei na nodra lewaeloma, sa dodonu mera veidokadokai ena yalo ni veitacini. + +Finnish +Kaikki ihmiset syntyvät vapaina ja tasavertaisina arvoltaan ja oikeuksiltaan. Heille on annettu järki ja omatunto, ja heidän on toimittava toisiaan kohtaan veljeyden hengessä. + +Finnish, Kven +Kaikki ihmiset synnythään vaphaina, ja heilä kaikila oon sama ihmisarvo ja samat ihmisoikkeuet. Het oon saanheet järjen ja omatunnon, ja het piethään elläät toinen toisen kans niin ko veljet keskenhään. + +Fon +Acɛ, susu kpo sisi ɖokpo ɔ kpo wɛ gbɛtɔ bi ɖo ɖò gbɛwiwa tɔn hwenu; ye ɖo linkpɔn bɔ ayi yetɔn mɛ kpe lo bɔ ye ɖo na do alɔ yeɖee ɖi nɔvinɔvi ɖɔhun. + +French +Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité. + +Frisian, Western +Alle minsken wurde frij en gelyk yn weardigens en rjochten berne. Hja hawwe ferstân en gewisse meikrigen en hearre har foar inoar oer yn in geast fan bruorskip te hâlden en te dragen. + +Friulian +Ducj i oms a nassin libars e compagns come dignitât e derits. A an sintiment e cussience e bisugne che si tratin un culaltri come fradis. + +Fulfulde, Nigerian +Innama aadeeji fof poti, ndimɗidi e jibinannde to bannge hakkeeji. Eɓe ngoodi miijo e hakkilantaagal ete eɓe poti huufo ndirde e nder ɓ iynguyummaagu. + +Fulfulde, Nigerian (2) +Ɓi-aadama fuu dimo danyete/jibinte o fotan be koomoye e neɗɗaaku be hakkeeji. ɓe ndokkaaɓe hakkiilo ngaandi nden bo ɓe kuutindiray hakkunde maɓɓe nder yiɗyiɗɗirki mbandiraagu. + +Fur +kwa-sí nyéttiŋ baajtólá kereli nás nisila na ta̱gɨdɨŋ arrá ka̱ɨŋ, Naŋ-sí ugola na kilmaŋá arrá ka̱ɨŋ namá in lóŋ áláŋ sǔrŋâ-sí kí jaiŋa in kéél ná sǔrŋâ suurꞌíŋ bârŋa. + +Ga +Afɔ gbɔmɔ fɛɛ gbɔmɔ yɛ agbojee mli, kɛ hegbɛ ko ni damɔ ŋɛlɛ koome nɔ. Gbɔmɛi fɛɛ yɛ jwɛŋmɔ kɛ henilee, ni no hewɔ lɛ esa akɛ amɛhe ahi shi yɛ nyɛmi suɔmɔ mli. 
+ +Gaelic, Irish +Saoláitear na daoine uile saor agus comhionann ina ndínit agus ina gcearta. Tá bauidh an réasúin agus an choinsiasa acu agus dlíd iad féin d'iompar de mheon bhrthreachais i leith a chéile. + +Gaelic, Scottish +Tha gach uile dhuine air a bhreth saor agus co-ionnan ann an urram 's ann an còirichean. Tha iad air am breth le reusan is le cogais agus mar sin bu chòir dhaibh a bhith beò nam measg fhein ann an spiorad bràthaireil, + +Gagauz +Insannar hepsi duuêrlar serbest hem birtakım kendi kıymetindä hem haklarında. Onnara verilmiş akıl hem üz da läazım biri-birinä davransınnar kardaşlık ruhuna uygun. + +Galician +Tódolos seres humanos nacen libres e iguais en dignidade e dereitos e, dotados como están de razón e conciencia, díbense comportar fraternalmente uns cos outros. + +Ganda +Abantu bazaalibwa nga balina eddembe n'obuyinza ebyenkanankana, batondebwa nga balina amagezi era nga basobola okwawula ekirungi n'ekibi bwebatyo, buli omu agwana okuyisa munne nga muganda we. + +Garifuna +Sun gürigia nasíruati yuti lun, lidan úarani, lawiwanduní libágari kai le aubai labúsienra, gatu giñe lanagun lungua buidu hadan líbegu. + +Gen +Agbetɔwo kpata le jijimɛa, ɖo vosinɔnɔ, nyi gbèsɔɛ́mɛ́wó le nujɔnunnyi ku goɖoejisewo, amɛbusewo mɛ. Tagbɔ le woa si, eye wɔnawo sɔdoda woanɔnɔwo gbɔa la nyi nɔ́visilélé. + +Georgian +ყოველი ადამიანი იბადება თავისუფალი და თანასწორი თავისი ღირსებითა და უფლებებით. მათ მინიჭებული აქვთ გონება და სინდისი და ერთმანეთის მიმართ უნდა ექცეოდნენ ძმობის სულისკვეთებით. + +German, Standard (1901) +Alle Menschen sind frei und gleich an Würde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Brüderlichkeit begegnen. + +German, Standard (1996) +Alle Menschen sind frei und gleich an Würde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Brüderlichkeit begegnen. 
+ +Gilyak +Сик нивгун куғытӻарта, пʼинамад яймта адяй правоғир̌ пʼӊафқ-ӊафқғир̌ салӻата ӿат пантадғун. + +Gonja +Bu kurwe dimedi kikɛ mobe kumu so, nɛ mobe, eyilikpa, kesheŋ nɛ kashinteŋ maŋ kɔr eko peyɛ to. Nyinpela sa dimedi kikɛ lakal nɛ mfɛra fanɛ bu chena abarso kelepo so. + +Greek (monotonic) +Όλοι οι άνθρωποι γεννιούνται ελεύθεροι και ίσοι στην αξιοπρέπεια και τα δικαιώματα. Είναι προικισμένοι με λογική και συνείδηση, και οφείλουν να συμπεριφέρονται μεταξύ τους με πνεύμα αδελφοσύνης. + +Greek (polytonic) +Ὅλοι οἱ ἄνθρωποι γεννιοῦνται ἐλεύθεροι καὶ ἴσοι στὴν ἀξιοπρέπεια καὶ τὰ δικαιώματα. Εἶναι προικισμένοι μὲ λογικὴ καὶ συνείδηση, καὶ ὀφείλουν νὰ συμπεριφέρονται μεταξύ τους μὲ πνεῦμα ἀδελφοσύνης. + +Guaraní, Paraguayan +Mayma yvypóra ou ko yvy ári iñapytlʼyre ha eteĩcha dignidad ha derecho jeguerekópe; ha ikatu rupi oikuaa añetéva ha añeteʼyva, iporãva ha ivaíva, tekotevẽ pehenguéicha oiko oñondivekuéra. + +Guarayu +Opakatu ava yoro’a nda’ei tembigwaigwa oyoyatupri, sekotupri, vaëra, imboeteisara, oikatu ipi’a yemoñeta, imbaekua, ndiyai yurekorairai ñepëi pëi ambua rese. + +Gujarati +પ્રતિષ્ઠા અને અધિકારોની દૃષ્ટિએ સર્વ માનવો જન્મથી સ્વતંત્ર અને સમાન હોય છે. તેમનામાં વિચારશક્તિ અને અંતઃકરણ હોય છે અને તેમણે પરસ્પર બંધુત્વની ભાવનાથી વર્તવું જોઇએ. + +Gumuz +Dubꞌaga bꞌaga metaam metaam alamaam kamaanzaakꞌoma kasꞌe bipokꞌoga kamadꞌab maafucꞌakꞌwa haaga bacꞌaga tso. Kábꞌaga jajanda kwa jala etigafalagash maꞌiiya nago metaagwa eyaal yida-eba bicꞌaga tso. + +Haitian Creole French (Kreyol) +Tout moun fèt lib, egal ego pou diyite kou wè dwa. Nou gen la rezon ak la konsyans epi nou fèt pou nou aji youn ak lot ak yon lespri fwatènite. + +Haitian Creole French (Popular) +Tout moun sou tè a fèt tou lib. Tout gen menm valè (nan je lasosyete), tout moun gen menm dwa devan Lalwa. Tout moun fèt ak yon bonsans, tout fèt ak yon konsyans epi youn fèt pou trete lòt tankou frè ak sè. + +Hani +Aqsol liq yoqdeivq yoqpyuq bo, meeqyaovq ssolnei colpyuq qiq kov dei. 
Davqtavcolssaq neenyuq bel neema meeq ya siq, laongaoq meilnaol nadul meil e gaq ssol hhyul hha bavqduv nia. + +Hausa +Duk ‘yan’adan ana haihuwarsu ne a matsayin ‘yantattun ‘ya’ya, kuma mutuncinsu da haqqoqinsu daidai yake da na kowa. Suna da tunani da cikakken hankali, saboda haka ake son duk mu’amalar da za su yi, ta kasance akwai ‘yan’uwantaka a tsakani. + +Hausa (Niger) +Su dai ƴan‐adam, ana haifuwarsu ne duka ƴantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ƴan‐uwanci. + +Hausa (Nigeria) +Su dai ‘yan-adam, ana haifuwarsu ne duka ‘yantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ‘yan-uwanci. + +Hawaiian +Hānau kū’oko’a ‘ia nā kānaka apau loa, a ua kau like ka hanohano a me nā pono kīvila ma luna o kākou pākahi. Ua ku’u mai ka no’ono’o pono a me ka ‘ike pono ma luna o kākou, no laila, e aloha kākou kekahi i kekahi. + +Hebrew +כל בני אדם נולדו בני חורין ושווים בערכם ובזכויותיהם. כולם חוננו בתבונה ובמצפון, לפיכך חובה עליהם לנהוג איש ברעהו ברוח של אחוה. + +Hiligaynon +Ang tanan nga tao ginbun-ag nga hilway kag may pag-alalangay sa dungog kag katarungan. Sila ginhatagan sang pagpamat-od kag konsensya kag nagakadapat nga magbinuligay sa kahulugan sang pag-inuturay. + +Hindi +सभी मनुष्यों को गौरव और अधिकारों के मामले में जन्मजात स्वतन्त्रता और समानता प्राप्त है । उन्हें बुद्धि और अन्तरात्मा की देन प्राप्त है और परस्पर उन्हें भाईचारे के भाव से बर्ताव करना चाहिए । + +Hindustani, Sarnami +Sab djanne aadjádi aur barabar paidaa bhailèn, iddjat aur hak mê. Ohi djanne ke lage sab ke samadj-boedj aur hierdaai hai aur doesare se sab soemmat sè, djaane-maane ke chaahin. + +Hmong, Northern Qiandong +Laix laix diangl dangt lol sob dab yangx ghax maix zit yef, niangb diot gid zenb nieef haib gid quaif lit gid nongd jus diel pinf denx. 
Nenx dol maix laib lix xent haib jox hvib vut, nenx dol nongt liek bed ut id xit deit dait. + +Hmong, Southern Qiandong +Leb leb nis zib youl nangs, mex ad sheit nangd zend yanl nhangs njanl lib. Mix mex lix xinb gaot liangt send, leb leb lies nhangs ghob nab ghob geud nangd. + +Hmong Njua +Cuat lenx cuat dol bongb deul ndax dex douf muax zif youx, nyaob shout zunb yinx tab ndas dos id, dax zis ib suk. Nil buab daf lol jaox muax lid xinf hlub hout tab liangx xinb shab nzhuk, yinf gaib keuk suk gud dix mol lol nit jinb shenx lol shib daf shib hlad. + +Huastec (San Luís Potosí) +Patal an inik ani an uxum u wa'tsinal walkadh abal jununúl kin bats'uw an alwa'taláb ani ka pidhan in éy jant'ini' in tomnál; in kwa'al in tsalpádh ani in k'ayá' abal kin k'anidha' in juntal. + +Huastec (Sierra de Otontepec) +Kuentsal nap wah-chínal tee ti chabal jayechek-i antip wah-chínal, bá tamá maxak a pulik maxak in exlal, jununul aní ni chap aní jaxtam ko-yal kip le-naxín aní ki k-ana ti ba. + +Huastec (Veracruz) +Ejtal an kw'ajiiltsik u wa'chinal kweteem ani chu'udh k'al an chu'uxtalaab ani yajat ka k'aak'naaxin juun ani juun. + +Huitoto, Murui +Nana caɨ comuillamona dama caɨ abido itɨcaɨ. Caɨ comuillamona jɨaɨmɨe anamo iñedɨcaɨ. Nana daje facaiconi itɨcaɨ. Abɨ uiñuanona comuidɨcaɨ. Danɨ coninɨrie caɨ nabairilla. + +Hungarian +Minden. emberi lény szabadon születik és egyenlő méltósága és joga van. Az emberek, ésszel és lelkiismerettel bírván, egymással szemben testvéri szellemben kell hogy viseltessenek. + +Ibibio +Kpukpuru owo emana nte amanison, enyun enyene ukem ukem uku ye unen. Eyoho mmo ye ukeme ndikere nkpo, ndinyun nyene esit, ke ntre, mmo enyene ndiman nkpo mbana kiet eken ke esit ndito eka. + +Icelandic +Hver maður er borinn frjáls og jafn öðrum að virðingu og réttindum. Menn eru gæddir vitsmunum og samvizku, og ber þeim að breyta bróðurlega hverjum við annan. + +Ido +Omna homi naskas libera ed egala relate digneso e yuri. 
Li es dotita per raciono e koncienco e devas agar vers l'una l'altra en spirito di frateso. + +Idoma +Ęgę ni modudu acę kęcę nya bęcę ęhehi aa ,hibi ęgͻ ma acę duu jonjilę ipu kocęgba nͻcę cęgba męml’ojonjilę ipu ͻdah ni yabͻ ͻcę nya. Odudu acę kwu ђwule ml’ohili otu męml’ocai kęla jͻcę ͻha ni yipu ͻtu ͻcę aa, higbͻ ma ͻcę higbo yͻda męml’ ͻmpa gunu lę bͻinę nu ma. + +Igbo +A mụrụ mmadụ nile n'ohere nakwa nha anya ugwu na ikike. E nyere ha uche na mmụọ ime ihe ziri ezi nke na ha kwesiri ịkpaso ibe ha agwa n'obi nwanne na nwanne. + +Ijo, Southeast +Kim’ owoumo se, keni bara ki na, pa zimi, ose keni bara kemi. Kim’se ye iroro, mani ikiou nana, enini kim’se dudu tari teme nana weri iyenri. + +Ilocano +Amin nga tao nga sibibiag ket naiyanak a siwawayawaya ken addaan iti agpapada nga dayaw ken kalintegan. Naikkanda ti panagikalintegan ken konsensya a nasken ti panagtitinnulong iti meysa ken meysa iti espiritu nga nainkak-absatan. + +Indonesian +Semua orang dilahirkan merdeka dan mempunyai martabat dan hak-hak yang sama. Mereka dikaruniai akal dan hati nurani dan hendaknya bergaul satu sama lain dalam semangat persaudaraan. + +Interlingua +Tote le esseres human nasce libere e equal in dignitate e in derectos. Illes es dotate de ration e de conscientia e debe ager le unes verso le alteres in un spirito de fraternitate. + +Inuktitut, Eastern Canadian +ᐃᓅᔪᓕᒫᑦ ᐊᓂᖅᑎᕆᔪᓕᒫᑦ ᐃᓅᓚᐅᕐᒪᑕ ᐃᓱᒪᕐᓱᕐᖢᑎᒃ ᐊᒻᒪᓗ ᐊᔾᔨᐅᖃᑎᒌᒃᖢᑎᒃ ᓂᕐᓱᐊᖑᓂᒃᑯᑦ ᐊᒻᒪᓗ ᐱᔪᓐᓇᐃᑎᑎᒍᑦ. ᐃᓱᖃᖅᑐᖁᑎᖃᕐᑎᑕᐅᕙᓕᕐᐳᑦ ᐱᔾᔪᑎᖃᕐᓂᒃᑯᑦ ᖃᑕᙳᑎᒌᑦᑎᐊᕆᐊᖃᕐᓂᒃᑯᓪᓗ. + +Inuktitut, Greenlandic +Inuit tamarmik inunngorput nammineersinnaassuseqarlutik assigiimmillu ataqqinassuseqarlutillu pisinnaatitaaffeqarlutik. Solaqassusermik tarnillu nalunngissusianik pilersugaapput, imminnullu iliorfigeqatigiittariaqaraluarput qatanngutigiittut peqatigiinnerup anersaavani. + +Italian +Tutti gli esseri umani nascono liberi ed eguali in dignità e diritti. Essi sono dotati di ragione e di coscienza e devono agire gli uni verso gli altri in spirito di fratellanza. 
+ +Japanese +すべての人間は、生まれながらにして自由であり、かつ、尊厳と権利とについて平等である。人間は、理性と良心とを授けられており、互いに同胞の精神をもって行動しなければならない。 + +Japanese (Osaka) +すべての人間は、生まれながらにして自由やし、かつ、尊厳と権利とについて平等や。人間は、理性と良心とを授けられており、互いに同胞の精神をもって行動しな。 + +Japanese (Tokyo) +全部の人間は、生まれながらにして自由であり、かつ、尊厳と権利と について平等である。人間は、理性と良心とを授けられており、互いに同 胞の精神をもって行動しなければならない。 + +Javanese (Javanese) +꧋ꦱꦧꦼꦤ꧀ꦲꦸꦮꦺꦴꦁꦏꦭꦲꦶꦂꦫꦏꦺꦏꦤ꧀ꦛꦶꦩꦂꦢꦶꦏꦭꦤ꧀ꦢꦂꦧꦺꦩꦂꦠꦧꦠ꧀ꦭꦤ꧀ꦲꦏ꧀ꦲꦏ꧀ꦏꦁꦥꦝ꧉​ꦏꦧꦺꦃꦥꦶꦤꦫꦶꦁꦔꦤ꧀ꦲꦏꦭ꧀ꦭꦤ꧀ꦏꦭ꧀ꦧꦸꦱꦂꦠꦏꦲꦗꦧ꧀ꦥꦱꦿꦮꦸꦁꦔꦤ꧀ꦲꦁꦒꦺꦴꦤ꧀ꦤꦺꦩꦼꦩꦶꦠꦿꦤ꧀ꦱꦶꦗꦶꦭꦤ꧀ꦱꦶꦗꦶꦤꦺꦏꦤ꧀ꦛꦶꦗꦶꦮꦺꦴꦱꦸꦩꦢꦸꦭꦸꦂ꧉​ + +Javanese (Latin) +Saben uwong kalairake kanthi mardika lan darbe martabat lan hak-hak kang padha. Kabeh pinaringan akal lan kalbu sarta kaajab pasrawungan anggone memitran siji lan sijine kanthi jiwo sumadulur. + +Jola-Fonyi +Bukanak búrom nan kuwolimi kurere kererer di waafaw búrom. Kubabaj poop búyejet di karampenoor. + +Jula +Wólo’ lá, hádamaden’ bɛɛ ye hɔrɔn ye, bɛɛ ká kán lànbe ní hákɛyaw lá. Mɔgɔ bɛɛ ye hákilitigi ye, bɛɛ ye hákilima ye ; ò là, ù ká kán kà ɲgɔn mína ní bádenya ye. + +K'iche', Central +Konojel ri winaq are taq ke'alaxik pa junaman ya'tal chkech kakechab'ej ronojel ri utzil; utz kakib'ano, kakichomaj, kakib'ij jasa je' ri k'o pa kanima, rumal che ri junam kib'antajik. Rajawaxik xuqe' kakimulij kib' che utzukuxuk ri loq'ob'al pa we uwachulew. + +Kabardian +Цӏыху псори щхьэхуиту, я щӏыхьымрэ я хуэфащэхэмрэкӏэ зэхуэдэу къалъхур. Акъылрэ зэхэщӏыкӏ гъуазэрэ яӏэщи, зыр зым зэкъуэш зэхащІэ яку дэлъу зэхущытын хуейхэщ. + +Kabiyé +Palʊlʊʊ ɛyaaa nɛ pa-tɩ yɔɔ wɛʊ kpaagbaa nɛ pɛwɛɛ kɩmaŋ wala ɛsɩndaa. Palʊlʊʊ-wɛ nɛ pɔ-lɔŋ nɛ pa-maɣzɩm; mbʊ yekina nɛ pɔsɔɔlɩ ɖama se pɛkɛ ɛyaa pa-tɩŋgɛ. + +Kabuverdianu +Tudo ser humano na ês mundo nacê libri e igual na sê dignidade e na sês drêto. Na sês razon e na sês concénça, tudo arguem debê porcêdê pa co tudo guenti na sprito di fraternidadi. + +Kafa +Ubbe ashi bushoo shiijjeto tatoonaa ame megoona aalloon, oogoonaa wuroonon yechiiniye. Ikkoo baroona manittine shalligoonaa naboona yeshet shalligoon boono shaddeyoo hakkiimm qello boonoshich ichete. 
+ +Kannada +ಎಲ್ಲಾ ಮಾನವರೂ ಸ್ವತಂತ್ರರಾಗಿಯೇ ಜನಿಸಿದ್ದಾರೆ. ಹಾಗೂ ಘನತೆ ಮತ್ತು ಹಕ್ಕುಗಳಲ್ಲಿ ಸಮಾನರಾಗಿದ್ದಾರೆ. ವಿವೇಕ ಮತ್ತು ಅಂತಃಕರಣಗಳನ್ನು ಪಡೆದವರಾದ್ದರಿಂದ ಅವರು ಪರಸ್ಪರ ಸಹೋದರ ಭಾವದಿಂದ ವರ್ತಿಸಬೇಕು. + +Kanuri, Central +Adamgana woso kambe katambo ye daraja-a hakkiwa-ason kalkalye. Hankal-a nazaru-asoro kəzəpkə ye suro hal nəmharamiben kamazasoga letaiyin ye. + +Kaonde +Bonse bantu basemwa bakasuluka kabiji baesakena pamo mubuneme. Baji na maana a kulanguluka kabiji bobila bantu bakwabo byubilo bakwibasekesha. + +Kaqchikel, Central +Konojel ri winaqi' kan kalaxib'en pe ri kolotajïk, ri junan kiq'ij, ri junan kejqalen, junan kich'ojib'al pa kik'aslen, xa achi'el k'a ri kik'ojlen, ri kinojib'al kichajin xa tik'amun k'a chi nimaläj konojel xtikajo' ki'. + +Karakalpak +Ҳәмме адамлар өз қәдир-қымбаты және ҳуқықларында еркин ҳәм тең болып туўылады. Оларға ақыл ҳәм ҳүждан берилген болып, бир-бирине туўысқанлық руўхындағы қатнаста болыўы тийис. + +Karelian +Kai rahvas roittahes vällinny da taza-arvozinnu omas arvos da oigevuksis. +Jogahizele heis on annettu mieli da omatundo da heil vältämättäh +pidäy olla keskenäh, kui vellil. + +Kasem +Ba loge nɔɔna maama se ba taa ye bedwe mo ba ŋwea de ba chega seini, ye fefeo teira kɔtaa. Wɛ pɛ ba swa de boboŋa mo se ba taa ye nubiu daane ye ba jege da ŋwaŋa. + +Kazakh +Барлық адамдар тумысынан азат және қадір‐қасиеті мен кұқықтары тең болып дүниеге келеді. Адамдарға ақыл‐парасат, ар‐ождан берілген, сондықтан олар бір‐бірімен туыстық, бауырмалдық қарым‐қатынас жасаулары тиіс. + +Khakas +Полған на кізі пос паза тиң тӧріпче паза тиң постың синін пілінгенін паза тӧрелерініңде полча. Олардың сағынғаны паза арығ сағыс пар паза харындастар чіли тудынарға киректер. + +Khasi +Ïa ki bynriew baroh la kha laitluid bad ki ïaryngkat ha ka burom bad ki hok. Ha ki la bsiap da ka bor pyrkhat bad ka jingïatiplem bad ha ka mynsiem jingsngew shipara ki dei ban ïatrei bynrap lang. 
+ +Khmer, Central +មនុស្សទាំងអស់ កើតមកមានសេរីភាព និងសមភាព ក្នុងផ្នែកសេចក្ដីថ្លៃថ្នូរនិងសិទ្ធិ។ មនុស្ស មានវិចារណញ្ញាណនិងសតិសម្បជញ្ញៈជាប់ពីកំណើត ហើយគប្បីប្រព្រឹត្ដចំពោះគ្នាទៅវិញទៅមកក្នុងស្មារតីភាតរភាពជាបងប្អូន។ + +Khün +ᨾᨶᩩᩔ᩼ᨴ᩠ᨦᩢᩉᩖᩣ᩠ᨿᨠᩮ᩠ᨯᩨᨾᩣᨾᩦᨻ᩠ᨦᩈᩁᩓᩢᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ ᨶᩱᨠᩥᨲ᩠ᨲᩥᩈ᩠ᨠᩢ ᩓᩢᩈᩥᨴ᩠ᨵᩥ ᨲ᩵ᩣ᩠ᨦᨣᩳ᩶ᨣᩢᨾᩦᨾᨶᩮᩣᨵᨾ᩠ᨾ᩼ᩓᩢ ᨣ᩠ᩅᩁᨷᨭᩥᨷ᩠ᨲᩢᨲᩳ᩵ᨠ᩠ᨶᩢᨯᩢ᩠ᩅ᩠ᨿᨣ᩠ᩅᩣ᩠ᨾᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ + +Kirghiz +Бардык адамдар өз беделинде жана укуктарында эркин жана тең укуктуу болуп жаралат. Алардын аң‐сезими менен абийири бар жана бири‐бирине бир туугандык мамилекылууга тийиш. + +Kissi, Northern +wanda tu cio Mɛ pilɔɔ o wolɔɔ ni, le waa o ba ndɔɔ cio, o bɛɛlen kenando ni, o tɔngdo ni, bɛtu nɔn yiyando a kullo, o kon ni naan tu dua mim maalyan kalapilɔyɛyi ni. + +Kituba +Bantu nyonso, na mbutukulu kevwandaka na kimpwanza ya bawu, ngenda mpe baluve ya mutindu mosi. Mayela na mbanzulu je na bawu, ni yawu yina bafwana kusalasana na bumpangi. + +Kituba (2) +Bantu nyonso ntangu bawu ke butukaka, bawu ke vwandaka na kimpwanza, ya kele mutindu mosi mpe na yina me tadila buzitu ya nzutu mpe baluve ya bawu. Bawu kele na mayindu mpe na bumuntu. Mpe nyonso yina bawu fwana kusala na sika ya bantu ya nkaka, bawu fwana kusala yawu na mpeve ya kimpangui. + +Komi-Permyak +Быдӧс отирыс чужӧны вольнӧйезӧн да ӧткоддезӧн достоинствоын да правоэзын. Нылӧ сетӧм мывкыд да совесть овны ӧтамӧдныскӧт кыдз воннэзлӧ. + +Konjo +Abandu omububuthiranwa bakabuthawa ibanawithe obuthoki nobuholho obulingirirene, mobahangikwa ibanawithe amenge, neryo ibakathoka erighabania abathya ekibuya nekisandire. Nokweryo buli muyima atholere eryanza munyikiwe ngababuthenwe. + +Koongo +Bantu nyonso, na mbutukulu kevwandaka na kimpwanza ya bawu, ngenda mpe baluve ya mutindu mosi. Mayela na mbanzulu je na bawu, ni yawu yina bafwana kusalasana na bumpangi. + +Koongo (Angola) +Bizingi bioso bisiwu ti batu bambutukanga mu kidedi ki buzitu ayi kibumswa. Bizingi-bene, batu, badi diela ayi tsi-ntima, bafwene kuzingila mbatzi-na-mbatzi-yandi mu mtima bukhomba. 
+ +Korean +모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 형제애의 정신으로 행동하여야 한다. + +Kpelle, Guinea +Nukan gele kaa pələ kaa tanɔn, yiliɓa nu kəle maawiyə pələ da tɔɔi gaa ɲei yɛnɛyii hu kɛpələ kaalɔ tanɔn; di kɛmɛni a nukan ŋaa ɓə gɛɛ hwəkɛli wɛlikɛmaa ə lɔ di luwai. + +Krio +ɛvribɔdi bɔn fri ɛn gɛt in yon rayt, nɔn wan nɔ pas in kɔmpin. Wi ɔl ebul fɔ tink ɛn fɛnɔt wetin rayt ɛn rɔŋ pantap dat wi fɔ sabi aw fɔ liv lɛk wan big famili. + +Kulango, Bouna +Igooyoo pɛɛ hʋn taa. Bɔ pɛɛ jabaga bɔrɔ. Hɔ ya gʋʋn’n bɔɔ hɛ pɛɛ, hɔ hɛ gusɛgɛ’n. + +Kurdish, Central +Hemû mirov azad û di weqar û mafan de wekhev tên dinyayê. Ew xwedî hiş û şuûr in û divê li hember hev bi zihniyeteke bratiyê bilivin. + +Kurdish, Northern +Hemû mirov azad û di weqar û mafan de wekhev tên dinyayê. Ew xwedî hiş û şuûr in û divê li hember hev bi zihniyeteke bratiyê bilivin. + +Ladin +Dötes les porsones nasc lëdies y cun la medema dignité y i medemi dërć. Ares à na rajun y na cosciënza y mëss s’incuntè öna cun l’atra te n spirit de fraternité. + +Ladino +Todos los umanos nasen libres i iguales en dinyidad i derechos i, komo estan ekipados de razon i konsensia, deven komportarsen kon ermandad los unos kon los otros. + +Lamnso' +Á dzə̀ə́ wir dzə̀m réŋréŋ fó ghvəm wùn à fó ghày, á yo’ dzə̀ə́ wir msòŋ ji kwàn. Wìr dzə̀m k̀m k fómo woo fó kwà’tì wùn à fó vifii, a wù kér fó a yiì e wùmò’ woo wír moo fə́r və. + +Lao +ມະນຸດເກີດມາມີສິດເສລີພາບ ແລະ ສະເໝີໜ້າກັນໃນທາງກຽດຕິສັກ ແລະ ທາງສິດດ້ວຍມະນຸດມີສະຕິສຳປັດຊັນຍະ(ຮູ້ດີຮູ້ຊົ່ວ)ແລະມີມະໂນທຳຈື່ງຕ້ອງປະພຶດຕົນຕໍ່ກັນໃນທາງພີ່ນ້ອງ. + +Latin +Omnes homines dignitate et iure liberi et pares nascuntur, rationis et conscientiae participes sunt, quibus inter se concordiae studio est agendum. + +Latin (1) +Omnes homines liberi aequique dignitate atque juribus nascuntur. Ratione conscientiaque praediti sunt et alii erga alios cum fraternitate se gerere debent. + +Latvian +Visi cilvēki piedzimst brīvi un vienlīdzīgi savā pašcieņā un tiesībās. 
Viņi ir apveltīti ar saprātu un sirdsapziņu, un viņiem jāizturas citam pret citu brālības garā. + +Latvian (2) +Visi cilvēki piedzimst brīvi un vienlīdzīgi cieņā un tiesībās. Viņiem ir dots saprāts un sirdsapziņa, un viņiem citam pret citu jāizturas brālības garā. + +Ligurian +Tutte e personn-e nascian libere e pæge in dignitæ e driti. Son dotæ de raxon e coscensa e gh’an da agî l’unn-a verso l’atra inte ’n spirito de fradelansa. + +Limba, West-Central +Biya-mɛti fooma be kiyo ka kuyankaŋ iŋ kasɛmbɛ mɛnɛ in ka yiki. Bindɛ kiŋ ba niyɔ in masimɔkɔ, maka yiina wo ka hu wɛndi yande. + +Lingala +Bato nyonso na mbotama bazali nzomi pe bakokani na limemya pe makoki. Bazali na mayele pe basengeli kofanda na bondeko okati na bango. + +Lingala (tones) +Bato nyɔ́nsɔ na mbótama bazalí nsɔ́mí mpé bakókání na limɛmya mpé makokí. Bazalí na mayɛ́lɛ mpé basengélí kovánda na bondeko o káti na bangó. + +Lithuanian +Visi žmonės gimsta laisvi ir lygūs savo orumu ir teisėmis. Jiems suteiktas protas ir sąžinė ir jie turi elgtis vienas kito atžvilgiu kaip broliai. + +Lobi +Teehuu sʋnɔ n ther ɛɛ nɩɩ bʋnɔ wa n do deeaʔ sɩ wʋ n makha samɩnɩ na nà hʋ tɩnɛpar rà. Thangba ti yɛr à pɛ yɛr jɩɩr nà fɩlwɛ sɩ a teena waan fʋkha omkhaa. + +Lozi +Batu kaufela ba pepilwe inge ba lukuluhile ni liswanelo ze swana. Ba ba ni swanelo ya ku nahana mi ba swanela ku ba ni likezo za buzwale ku mutu yo mung'wi. + +Luba-Kasai +Bantu bonsu badi baledibwa badikadile ne badi ne makokeshi amwe. Badi ne lungenyi lwa bumuntu ne kondo ka moyo, badi ne bwa kwenzelangana malu mu buwetu. + +Lunda +Muntu wejima wasemuka walukbuka wesekana hamu ni akwawu mukumulemesha. Wenkewa kutong'ojoka nikuzatila hamu nimukwawu muntu muwunta'a. + +Luvale +Vatu vosena vasemuka yapwa hohamwe nakweseka mukuyoya chavo. Vatwama nachiyoyelo chalusesa chajingolo chakuzanga kulivwashana muchiyoyelo chavo. + +Luxembourgeois +All Mënsch kënnt fräi a mat deer selwechter Dignitéit an dene selwechte Rechter op d'Welt. 
Jiddereen huet säi Verstand a säi Gewësse krut an soll an engem Geescht vu Bridderlechkeet denen anere géintiwwer handelen. + +Macedonian +Сите човечки суштества се раѓаат слободни и еднакви по достоинство и права. Тие се обдарени со разум и совест и треба да се однесуваат еден кон друг во духот на општо човечката припадност. + +Madura +Sadajana oreng lahir mardika e sarenge drajat klaban hak-hak se dha-padha. Sadajana eparenge akal sareng nurani ban kodu areng-sareng akanca kadi taretan. + +Magahi +सब लोग आजादे जन्म लेब हई तथा सब के बराबरे सम्मान और अधिकार हइ। हुनखो के पास समझ-बूझ और अंत:करण के आवाज होब हई। और हुनका दोसरो के साथ भाईचारा के व्यवहार करे पड़ हई। + +Maithili +सभ मानव जन्मतः स्वतन्त्र अछि तथा गरिमा आʼ अधिकारमे समान अछि। सभकेँ अपन–अपन बुद्धि आʼ विवेक छैक आओर सभकेँ एक दोसराक प्रति सौहार्दपूर्ण व्यवहार करबाक चाही। + +Makhuwa +Atthu othene aniyaria oolikana ni owilamula moota ontthunaya okhala, variyari v’edignidade ni edireito. Akhalanne esaria ni otthokelela, ahaana akhalasaka othene saya vamurettele. + +Makonde +Vanu vohevohe vaidile n’chilambo valendene. Vanijaliwa ulimala vene. Pavele vanu pave na ulongo. + +Malagasy, Plateau +Teraka afaka sy mitovy zo sy fahamendrehana ny olombelona rehetra. Samy manan-tsaina sy fieritreretana ka tokony hifampitondra am- pirahalahiana. + +Malay (Arabic) +سموا مأنسي دلاهيركن بيبس دان سامرات دري سڬي كموليأن دان حق٢. مريك ممڤوڽاي ڤميكيرن دان ڤراسأن هاتي دان هندقله برتيندق د انتارا ساتو سام لائن دڠن سماڠت ڤرساودارأن. + +Malay (Latin) +Semua manusia dilahirkan bebas dan samarata dari segi kemuliaan dan hak-hak. Mereka mempunyai pemikiran dan perasaan hati dan hendaklah bertindak di antara satu sama lain dengan semangat persaudaraan. + +Malayalam +മനുഷ്യരെല്ലാവരും തുല്യാവകാശങ്ങളോടും അന്തസ്സോടും സ്വാതന്ത്ര്യത്തോടുംകൂടി ജനിച്ചിട്ടുള്ളവരാണ്‌. അന്യോന്യം ഭ്രാതൃഭാവത്തോടെ പെരുമാറുവാനാണ്‌ മനുഷ്യന്നു വിവേകബുദ്ധിയും മനസ്സാക്ഷിയും സിദ്ധമായിരിക്കുന്നത്‌. 
+ +Malayalam +മനുഷ്യരെല്ലാവരും തുല്യാവകാശങ്ങളോടും അന്തസ്സോടും സ്വാതന്ത്ര്യത്തോടുംകൂടി ജനിച്ചിട്ടുള്ളവരാണ്‌. അന്യോന്യം ഭ്രാതൃഭാവത്തോടെ പെരുമാറുവാനാണ്‌ മനുഷ്യന്നു വിവേകബുദ്ധിയും മനസ്സാക്ഷിയും സിദ്ധമായിരിക്കുന്നത്‌. + +Maldivian +ހުރިހާ އިންސާނުންވެސް ދުނިޔެއަށް އުފަންވަނީ، މިނިވަންކަމުގައި، ހަމަހަމަ ޙައްޤުތަކަކާއެކު، ހަމަހަމަ ދަރަޖައެއްގައި ކަމޭހިތެވިގެންވާ ބައެއްގެ ގޮތުގައެވެ. ހެޔޮ ވިސްނުމާއި، ހެޔޮބުއްދީގެ ބާރު އެމީހުންނަށް ލިބިގެންވެއެވެ. އަދި އެކަކު އަނެކަކާމެދު އެމީހުން މުޢާމަލާތް ކުރަންވާނީ، އުޚުއްވަތްތެރިކަމުގެ ރޫޙެއްގައެވެ. + +Maltese +Il-bnedmin kollha jitwieldu ħielsa u ugwali fid-dinjità u d-drittijiet. Huma mogħnija bir-raġuni u bil-kuxjenza u għandhom iġibu ruħhom ma’ xulxin bi spirtu ta’ aħwa. + +Mam, Northern +Kyaqiilqe winaq nchi itz'aj tuj kopib'il, juunx kychuwiinqal b'ix kyokleen, kyja'tzan tuj tb'aanal xiinv'il tu'n kyanq'iin tuj b'ank'u'j kyxool. + +Maninkakan, Eastern +Adamadennu bɛɛ sɔdɔnɲa kakan, hɔrɔya dɔ, fabadenɲa dɔ ani sariya ta fan dɔ. Hankili ni sɔnɔmɛ ye alu bɛɛ ma, a kakan wo dɔ alu ye bakelenɲa sila lataaman alu ɲɔɔn tɛ. + +Manx +Ta dy chooilley ghooinney ruggit seyr as corrym rish dy chooilley ghooinney elley ayns ooashley as ayns cairys. Ta resoon as cooinsheanse stowit orroo as lhisagh ad dellal rish y cheilley lesh spyrryd braaragh. + +Maori +Ko te katoa o nga tangata i te whanaungatanga mai e watea ana i nga here katoa; e tauriterite ana hoki nga mana me nga tika. E whakawhiwhia ana hoki ki a ratou te ngakau whai whakaaro me te hinengaro mohio ki te tika me te he, a e tika ana kia meinga te mahi a tetahi ki tetahi me ma roto atu i te wairua o te noho tahi, ano he teina he tuakana i ringa i te whakaaro kotahi. + +Mapudungun +Kom pu mogence kisuzuam mvlekey, kom cegeygvn, logkogeygvn ka piwkegeygvn, nieygvn kimvn fey mew mvley tañi yamniewael ka epuñpvle kejuwael egvn. + +Marathi +सर्व मानवी व्यक्ति जन्मतःच स्वतंत्र आहेत व त्यांना समान प्रतिष्ठा व समान अधिकार आहेत. त्यांना विचारशक्ति व सदसविद्वेकबुद्धि लाभलेली आहे. 
व त्यांनी एकमेकांशी बंधुत्याच्या भावनेने आचरण करावे. + +Marshallese +Armij otemjej rej rujlok ilo anemkwoj im jonon utiej eo im maron ko air wot juon. Emwij lelok non ir maron in bukot non ir make im bareinwot boklikot kin men ko rej tomaki im bwe jerbal non dron ilo juon jitobon jimpenjatin. + +Matsés +Chidon tishaido yec matses abitedimbo bëdamboec isnanac bëdambo ictsiash. Chieshnanac icsambo ictsiash. Abitedimbo bëdamboec tabadac bëdambo ictsiash. Shubu abentsëcquidën tabadac birnboec abitedi tabadac bëdambo ictsiash - quequin chuipanëdash nidaid abitedinoësh cho-choquidon. + +Maya, Yucatán +Tuláakal wíinik ku síijil jáalkʼab yetel keet u tsiikul yetel Najmal Sijnalil, beytun xan naʼataʼan sijnalil yetel noʼojaʼanil u tuukuloʼ, kʼaʼabet u bisikuba bey láaktzilil yetel tuláakal u baatzileʼ. + +Mazahua Central +Texe yo nte̱'e̱ chjetrjoji, angezeji ximi xo'oji ñeje k'inchiji, nesta ra ngara na jo'o k'o dyaja e nte̱'e̱. + +Mazatec, Ixcatlán +Nga ndindie xuta ngatsen de’e ko ngondsejen ngatjin-kjua nga xchandinkon nt’a ngondsejen ngatjin kokjin-tokon,kotjinkjua nga takie engajan skuendinkon xkjin. + +Mbundu +O athu woso avwala abhuluka ni kusokela mu kijingu ni mu itekelu. Ene ala ni ulungilu ni kilunji ni atokala kulaya kumoxi nya akwa mu mixima ya undandu. + +Mbundu (009) +Mutu uoso uoso a mu vuala ni ufolo ni kutena kumoxi mu kijingu ni mu ubinganu. Mu kilembu kia kubanga ni mu ubanzelu, Atena uê kubanga ioso kua akua mu muxima ua tululuka mba upange. + +Mende +Numuvuisia Kpɛlɛɛ ta ti le tɛ yɛ nduwɔ ya hu, tao ti nuvuu yei kɛɛ ti lɔnyi maa hɛwungɔ. Kiiya kɛɛ hindaluahu gɔɔla a yɛlɔ ti hun. Fale mahoungɔ ti ti nyɔnyɔhu hoi kia ndeegaa. + +Micmac +Msit mimajulnu’k weskwijinu’ltijik alsumsultijik aqq newte’ tett wkpimte’tmut aqq koqwajo’taqnn wejkul’aqmititl. + +Minangkabau +Sadonyo manusia dilahiakan mardeka dan punyo martabat sarato hak-hak nan samo. Mareka dikaruniai aka jo hati nurani, supayo satu samo lain bagaul sarupo urang badunsanak. 
+ +Mískito +Upla sut ba kulkanka lakara, airaitka nanira bara pri, sin, aikuki, baku takisa. Bamna sins laka bri baku, lukanka bain pri baku aimuihni lakara, pana pana tabaikan kaiasa. + +Mixe, Totontepec +Tum akijpxa xa ve’e jayu kye’ex, ve’em ax jö’n tyukidaakjüva tijaty mëkin; ve’empa axjö’n jä jyööjtykin di yaknaxy, jats oy myujatyöö’tëjk di mëët nayjavajüt. + +Mixtec, Metlatónoc +Taka ma ñayi nguiakoi ñayivi ñatu na ja'a tnu'u ja kusa'a ndeva'ña-i, su'uva kajito va'aña-i, yuka ku ja jiniñu'u ja kukototna-i. + +Mizo +Mi zawng zawng hi zalêna piang kan ni a, zahawmna leh dikna chanvoah intluk tlâng vek kan ni. Chhia leh tha hriatna fîm neia siam kan nih avangin kan mihring puite chungah inunauna thinlung kan pu tlat tur a ni. + +Moba +Nifoi kul maal yendu buam po i, k b yudand yen b yiko-nba biɛ ja. B mɔg maalm g ban yal g ŋan, g biɛ baa bu yen lieb naataann n ninŋ i. + +Mon +မၞိဟ်ဂမၠိုၚ် အိုဿီုတအ်ဝွံ စနူသၠးတိတ် နူဂဝ်ဂၞဴ ဒှ်မၞိဟ်သၠးပွးအိုတ်တုဲ အခေါၚ်အရာ ကေုာံ သိက္ခာမၞိဟ်တအ် တုပ် သၟဟ်ရ။ မၞိဟ်တအ်ဂှ် နွံကဵုဓရ်စၚ်ခြၚ်ကေုာံ သမ္တီညာဏ် ဓဝ်ပါ်ပဲါ ခိုဟ်ပရေံနွံတုဲ ညးမွဲ ကေုာံ ညးမွဲ ထေက်ကဵု သ္ဒးဒ္ဂေတ်ဗက် ဆက်ဆောံညးသ္ကအ် နစိုတ်ဓာတ်ကောံဒေံအရေ။ + +Mongolian, Halh (Cyrillic) +Хүн бүр төрж мэндлэхэд эрх чөлөөтэй, адилхан нэр төртэй, ижил эрхтэй байдаг. Оюун ухаан, нандин чанар заяасан хүн гэгч өөр хоорондоо ахан дүүгийн үзэл санаагаар харьцах учиртай. + +Mongolian, Halh (Mongolian) + + ᠬᠦᠮᠦᠨ ᠪᠦᠷ ᠲᠥᠷᠥᠵᠦ ᠮᠡᠨᠳᠡᠯᠡᠬᠦ ᠡᠷᠬᠡ ᠴᠢᠯᠥᠭᠡ ᠲᠡᠢ᠂ ᠠᠳᠠᠯᠢᠬᠠᠨ ᠨᠡᠷ᠎ᠡ ᠲᠥᠷᠥ ᠲᠡᠢ᠂ ᠢᠵᠢᠯ ᠡᠷᠬᠡ ᠲᠡᠢ ᠪᠠᠢᠠᠭ᠃ ᠣᠶᠤᠨ ᠤᠬᠠᠭᠠᠨ᠂ ᠨᠠᠨᠳᠢᠨ ᠴᠢᠨᠠᠷ ᠵᠠᠶᠠᠭᠠᠰᠠᠨ ᠬᠦᠮᠦᠨ ᠬᠡᠭᠴᠢ ᠥᠭᠡᠷ᠎ᠡ ᠬᠣᠭᠣᠷᠣᠨᠳᠣ᠎ᠨ ᠠᠬᠠᠨ ᠳᠡᠭᠦᠦ ᠢᠨ ᠦᠵᠢᠯ ᠰᠠᠨᠠᠭᠠ ᠥᠠᠷ ᠬᠠᠷᠢᠴᠠᠬᠥ ᠤᠴᠢᠷ ᠲᠠᠢ᠃ + + +Montenegrin +Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i savješću i jedni prema drugima treba da postupaju u duhu bratstva. + +Mòoré +Ninsaalbã fãa sã n doge, ned fãa so a menga, ned pa rogd n yaa yamb ye, nebã fãa zema taab b yel-segdɩ la b burkĩndlem wɛɛngẽ. Nebã fãa tara yam la tagsgo, ned fãa togame n vɩɩnd ne a to saam-biir pʊgẽ. 
+ +Moro +Leđa pređ lalǝŋǝnia lëbǝrëinialo na lǝɽǝwaṯo eŋen ŋǝđamia na eŋen pređ iŋi ŋǝrcađaṯo ṯa leđa alǝfiđi. Lënŋulu pređ lananëinu đǝnaca đame ɽetǝɽeto na ara gǝŋǝra ŋenŋanṯa alǝɽǝwađaṯe alamǝđaiđe bǝɽan usilaga gǝŋǝlǝŋǝnia na gǝŋorba. + +Mozarabic +Totos les esseres humanos nascent libberos et eguales in dignitate e dretos e, dotatos commo stant de racione e conscientia, devent comportarse in germanitate les unos con les altros. + +Naga, Ao +Meimchir ajak temeten aser tashi kasa nüji nung asor. Parnok dak bilemtettsü shisatsü aser tangatetba kasa agüja aliba jagi külem adianu rongnung tanela ka nung lungjema alitsüla. + +Nahuatl, Central +Nochi tlakamej uan siuamej kipiaj manoj kuali tlakatisej, nochi san se totlatechpouiltilis uan titlatepanitalojkej, yeka moneki kuali ma timouikakaj, ma timoiknelikaj, ma timotlasojtlakaj uan ma timotlepanitakaj. + +Nanai +Хэмту найсал гипалин, мэнэ гэбудиэри, правосалдиари эмуту балдичи. Нёанчи муруӈку, дэрэлку, диа диавари а-нэу-мэт бодомари тагилайчи. + +Navajo +Bilaʼashdaʼii tʼáá ałtsoh yiníkʼehgo bidizhchįh dóó aheełtʼeego ílį́į́go bee baahóchįʼ. Eíí háníʼ dóó hánítshakees hwiihdaasyaʼ eíí binahjį́ʼ ahidiníłnáhgo álíleekʼehgo kʼé bee ahił niidlį́. + +Ndebele +Abantu bonke bazalwa bekhululekile njalo belingana kumalungelo abo. Balesipho sikanembeza, ngakho bamele baphathane ngomoya otshengisa ubuhlobo lobunye. + +Ndonga +Aantu ayehe oya valwa ye na emanguluko noye na ondilo yi thike pamwe osho wo uuthemba. Oye na omaipulo goondunge neiuvo onkene naa kalathane mombepo yuumwainathana. + +Nenets +Ет хибяри ненэць соямарианта хуркари правада тнява, ӈобой ненэця ниду нись токалба, ӈыбтамба илевату тара. + +Nepali +सबै व्यक्ति हरू जन्मजात स्वतन्त्र हुन ती सबैको समान अधिकार र महत्व छ। निजहरूमा विचार शक्ति र सद्धिचार भएकोले निजहरूले आपसमा भातृत्वको भावना बाट व्यवहार गर्नु पर्छ। + +Nganasan +Бәнде” ӈанасанә” ӈәтукәнды” нендя”туо” ӈонә хонсы хелиде” ӈиле мәнәй (правай). 
Сытыӈ хонды” ӈиле ӈонда ӈонә сяру, дүзытәндыӈ ихүтүӈ нягәә” сүөарусә”. + +Niue +Ko e tau tagata momoui oti kua fanau ai ke he fakatokanoaaga mo e fakatatai oti e tau tutuaga mo e tau tonuhia. Kua moua ai foki e lautolu e kakano mo e manamanatuaga ti kua lata ni ke fakafetui e taha ke he taha ke he agaga fakamatakainaga. + +Nomatsiguenga +Antagaisati matsiguenga ibogaiguë matsiguengasonorl. Aisati icantaigaca. Teni iromerataiguengani. Antagaisati iquengaigui aisati igóiguiro ora caninaro aisati igóiguiro ora te onganinate. Iroro caninataque omagaro matsiguenga iraniacaninataigueri ira basiniati matsiguenga aisati ingantaiguerí ora caninaro. + +Norwegian, Bokmål +Alle mennesker er født frie og med samme menneskeverd og menneskerettigheter. De er utstyrt med fornuft og samvittighet og bør handle mot hverandre i brorskapets ånd. + +Norwegian, Nynorsk +Alle menneske er fødde til fridom og med same menneskeverd og menneskerettar. Dei har fått fornuft og samvit og skal leve med kvarandre som brør. + +Nuosu +ꊿꂷꃅꄿꐨꐥ,ꌅꅍꀂꏽꐯꒈꃅꐥꌐ。ꊿꊇꉪꍆꌋꆀꁨꉌꑌꐥ,ꄷꀋꁨꂛꊨꅫꃀꃅꐥꄡꑟ。 + +Nyamwezi +Banhu bose bubyalagwa biyagalulile, n’ikujo haki zilenganelile. + +Nyanja (Chechewa) +Anthu onse amabadwa aufulu ndiponso ofanana mu ulemu ndi ufulu wao. Iwowa ndi wodalitsidwa ndi mphamvu zoganiza ndi chikumbumtima ndipo achitirane wina ndi mnzake mwaubale. + +Nyanja (Chinyanja) +Anthu onse amabadwa mwa ufulu ndiponso olinganga m' makhalidwe ao. Iwo amakhala ndi nzeru za cibadwidwe kotero ayenera kucitirana zabwino wina ndi mnzace. + +Nyankore +Abantu nibazaarwa baine obugabe nobushoborozi ebiri kwingana nibahangwa baine obwengye kandi barikubasa kwahura ekirungi nekibi, nahabwekyo abantu bashemereire kutuura kumwe nkabanya Uganda. + +Nyemba +Vanu voxe vakasemuka mu cizango co mumo lika mu vulemu co kulimanena. Vakevo vakala na mangana co na mbunge co vana pande kulinga vamo na vakwavo na mbunge ya vuna yina. + +Nzema +Menli muala di bɛ ti anwo na eza noko bɛsɛ wɔ dibilɛ nee adenlenyianlɛ nu. 
Bɛlɛ ndwenlenwo nee adwenle, yemɔti ɔwɔ kɛ bɛkile adiemayɛlɛ bɛmaa bɛ nwo ngoko. + +Occitan +Tóuti lis uman naisson libre. Soun egau pèrla digneta e li dre. An tóuti uno resoun e uno counsciènci. Se dèvon tenifreirenau lis un 'mé lis autre. + +Occitan (Auvergnat) +Ta la proussouna neisson lieura moé parira pà dïnessà mai dret. Son charjada de razou moé de cousiensà mai lhu fau arjî entremeî lha bei n'eime de freiressà. + +Occitan (Francoprovençal, Fribourg) +Totè lè dzin vinyon ou mondo libro è parê in dinyitâ è in drê. Chon dotâ dè réjon è dè konhyinthe è dêvon chè konportâ lè j’on-lè j’ôtro din on èchpri dè fratèrnitâ. + +Occitan (Francoprovençal, Savoie) +Tu luz òmò vinyon u mondo, librò, tu tòton pè leû dinyitò è leû drèye. Y’on tu d’émò è dè konhyinhi è i dèvon fè- mouhò dè fratèrnitò aouèy luz òtri. + +Occitan (Francoprovençal, Valais) +Tui lè jêtre humain néchon libro è pary in degnetâ é in drouê. Chon reijonâbló è dè counchieince è deivouon âzic lè j’oun vi j’avi di j’âtró in pèr oun espri dè fratèrnitâ + +Occitan (Francoprovençal, Vaud) +Tî lè z’ître humain vîgnant âo mondo libro et parâi dein la dignitâ et lè drâi. L’ant reçu réson et concheince et dâivant vivre lè z’on avoué lè z’autro quemet se sant frâre et chèra. + +Occitan (Languedocien) +Totes los èssers umans naisson liures e egals en dignitat e en dreches. Son dotats de rason e de consciéncia e se devon comportar los unes amb los autres dins un esperit de fraternitat. + +Ojibwa, Northwestern +ᑭᑲᓇᐌᓀᓐ ᑲᐱᒪᑎᓯᐗᑦ ᓂᑕᐎᑭᐗᒃ ᑎᐯᓂᒥᑎᓱᐎᓂᒃ ᒥᓇ ᑕᐱᑕ ᑭᒋᐃᓀᑕᑯᓯᐎᓐ ᑲᔦ ᑌᐸᑫᑕᑯᓯᐎᓐ. ᐅᑕᔦᓇᐗ ᒥᑲᐎᐎᓐ ᑲᔦ ᓂᑄᑲᐎᓐ ᒥᓇᐗ ᑕᔥ ᒋᐃᔑᑲᓇᐗᐸᑎᐗᐸᓐ ᐊᒐᑯ ᒥᓄᐎᒋᐎᑎᐎᓂᒃ. + +Okiek +Piik togol kosigotiik en katiagetapkei koguyet ak imandanyuwan koyuyosin togol kogigigochi ngomnotet ak koperuret en iyon konyolu koyochigei oteptop tupchondit. + +Orok +Чипа̄ли гурунне̄ балӡичи гэвумэ, омотто мэ̄нэ мөрөнӡи, мэ̄нэ доронӡи. Но̄чи идэлу, иркалу, мэ̄нэ мэ̄нӡи на̄дактаӈачи бјӣчи. 
+ +Oromo, Borana-Arsi-Guji +Namooti hundinuu birmaduu ta'anii mirgaa fi ulfinaanis wal-qixxee ta'anii dhalatan. Sammuu fi qalbii ittiin yaadan waan uumamaan kennameef, hafuura obbolummaatiin walii-wajjin jiraachuu qabu. + +Oroqen +Beyel bambur zhiyu bishi, zhunyan-du bineken chuanli-du bambur pingdeng bishi. Nugartin lishing bineken liangshin bishi, akin nekun guanshi-ngi chingshen-du-in duidai-meet-ki-tin. + +Osetin +Адӕймӕгтӕ се 'ппӕт дӕр райгуырынц сӕрибарӕй ӕмӕ ӕмхуызонӕй сӕ барты. Уыдон ӕххӕст сты зонд ӕмӕ намысӕй, ӕмӕ кӕрӕдзийӕн хъуамӕ уой ӕфсымӕрты хуызӕн. + +Otomi, Mezquital +Gotho nu kja'ni i mu̱i ra zoo i gotho ro kuchti, i tu'ni nu ro ña padä bini i da budi, da mu̱i ra zoo koyu gotho yu kja'ni i yo kuadi. + +Otuho +lsiuni aati dang iko ahodc hade ihaniere erre boo ve isi orrijori dang to +nelotulo. Owoni isi iko negigilita bwo ve iko ataja. Ongida isi ihanie +awatek hosi ihwo elarak. + +Páez +Ya'nwe'wewa'te' maa nasapa ha'dacehk hi'pku up'hi', wëtte u'huwa'hi'pta', eena' eena' f'i'zewa' hi'pta', üus hi'pta' d'ik'the hi'pta' naapa'kate. Sa' h'ukaysa üus hi'pcehktha'w sa' pyakhna'we f'i'ze hi'ptha'w. + +Palauan +A rogui 'l chad el mechell a ngarngii a ilmokl er tir ra diosisiu el llemalt. Ngarngii er tir a uldesuir mete mo meruul el mo rar bebil lokiu a ungil 'l omeruul ra klauchad. + +Pampangan +Ding sablang tau mibait lang malaya at pante-pante king karangalan at karapatan. Ila mipagkaluban lang katuliran at konsensiya ay dapat misaupan king diwang pamikapatiran. 
+ +Panjabi, Eastern +ਸਾਰਾ ਮਨੁੱਖੀ ਪਰਿਵਾਰ ਆਪਣੀ ਮਹਿਮਾ, ਸ਼ਾਨ ਅਤੇ ਹੱਕਾਂ ਦੇ ਪੱਖੋਂ ਜਨਮ ਤੋਂ ਹੀ ਆਜ਼ਾਦ ਹੈ ਅਤੇ ਸੁਤੇ ਸਿੱਧ ਸਾਰੇ ਲੋਕ ਬਰਾਬਰ ਹਨ । ਉਨ੍ਹਾਂ ਸਭਨਾ ਨੂੰ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਦੀ ਸੌਗਾਤ ਮਿਲੀ ਹੋਈ ਹੈ ਅਤੇ ਉਨ੍ਹਾਂ ਨੂੰ ਭਰਾਤਰੀਭਾਵ ਦੀ ਭਾਵਨਾ ਰਖਦਿਆਂ ਆਪਸ ਵਿਚ ਵਿਚਰਣਾ ਚਾਹੀਦਾ ਹੈ । + +Panjabi, Western +سارے انسان آزاد تے حقوق تے عزت دے لحاظ نال برابر پیدا ہوندے نیں ۔ ۔ اوہ عقل سمجھ تے چنگے مندے دی پچھان تے احساس رکھدے نے ایس واسطے اوہناں نوں اک دوجے نال بھائی چارے والا سلوک کرنا چاہی دا اے ۔ ۔ + +Papiamentu +Tur ser humano ta nace liber y igual den dignidad y den derecho. Nan ta dota cu rason y cu consenshi y nan mester comporta nan den spirito di fraternidad pa cu otro. + +Pashto, Northern +د بشر ټول افراد ازاد نړۍ ته راځي او د حيثيت او د حقوقو له پلوه سره برابر دي۔ ټول د عقل او وجدان خاوندان دي او بايد يو له بل سره د ورورۍ په روحيه سره چلنند کړي۔ + +Picard +Tos lès-omes vinèt å monde lîbes èt égåls po çou qu'èst d' leû dignité èt d' leûs dreûts. Leû re̊zon èt leû consyince elzî fe̊t on d'vwér di s'kidûre inte di zèle come dès frès + +Pidgin, Nigerian +Everi human being, naim dem born free and dem de equal for dignity and di rights wey we get, as human beings, God come give us beta sense wey we de take tink well, well and beta mind, sake for dis, we must to treat each other like broda and sister. + +Pijin +Evri man en mere olketa born frii en ikwol lo digniti en raits blo olketa. Olketa evriwan olketa garem maeni fo tingting en olketa sapos fo treatim isada wittim spirit blo bradahood. + +Pintupi-Luritja +Nganana maru tjuta, tjulkura tjuta, manta yurungka parrari nyinapayi tjutanya liipulala nyinanyi, nganana yanangu maru tjuta wiya kuyakuya. Yuwankarrangkuya palya nintingku kulini. Tjanaya palya kutjupa tjutaku tjukarurru nyinanytjaku, walytja tjuta nguwanpa, mingarrtjuwiya. Tjungungku palyangku kurrunpa kutjungku.Wangka ngaangku nganananya tjakultjunanyi rapa ngaranytjaku kutjupa tjuta nguwanpa. 
+ +Pipil +Muchi ne tay gen tu weyga nestiwit tamagixti genga tik ekneliat wan ipal wan gichiwtiwit ipal ma munegigan ne se pal ne se. + +Pohnpeian +Tohn sampa karos ipwiwei nan saledek oh duwepenehte nan arail wasa oh arail pwung. Arail marain oh pehm ih utakerail kahrehda korusie konehng sawaspene nin duwen pirien ehu. + +Polish +Wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw. Są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa. + +Portuguese (Brazil) +Todos os seres humanos nascem livres e iguais em dignidade e direitos. São dotados de razão e consciência e devem agir em relação uns aos outros com espírito de fraternidade. + +Portuguese (Portugal) +Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de razão e de consciência, devem agir uns para com os outros em espírito de fraternidade. + +Pular +NEDDHANKE EN FOW DYIBINTE NO HETTII NO FOTA E DHI FOW, E NDIMU E HANDANDHI. BHE DYIBINDINTE E HAGGHIL E FAAMU ; HIBHE HAANI DYOGONDIRDE E NDER HAGGHIL NEENEGOOTAANKAAKU. + +Pular (Adlam) +𞤋𞤲𞥆𞤢𞤥𞤢 𞤢𞥄𞤣𞤫𞥅𞤶𞤭 𞤬𞤮𞤬 𞤨𞤮𞤼𞤭، 𞤲'𞤣𞤭𞤥𞤯𞤭𞤣𞤭 𞤫 𞤶𞤭𞤦𞤭𞤲𞤢𞤲𞥆𞤣𞤫 𞤼𞤮 𞤦𞤢𞤲𞥆𞤺𞤫 𞤸𞤢𞤳𞥆𞤫𞥅𞤶𞤭. 𞤉𞤩𞤫 𞤲'𞤺𞤮𞥅𞤣𞤭 𞤥𞤭𞥅𞤶𞤮 𞤫 𞤸𞤢𞤳𞥆𞤭𞤤𞤢𞤲𞤼𞤢𞥄𞤺𞤢𞤤 𞤫𞤼𞤫 𞤫𞤩𞤫 𞤨𞤮𞤼𞤭 𞤸𞤵𞥅𞤬𞤮 𞤲'𞤣𞤭𞤪𞤣𞤫 𞤫 𞤲'𞤣𞤫𞤪 𞤩 𞤭𞤴𞤲𞤺𞤵𞤴𞤵𞤥𞥆𞤢𞥄𞤺𞤵. + +Purepecha +Iamendu k'uiripuecha janguarhiparini ka majku jarhati ka jurhimbekuecha jingoni kueraaŋasondikso ka, juajtakuarhisïndiksï ambakiti eratsekua ka kaxumbikua, jatsistiksï eskaksï sesi arhijperaaka. + +Q'eqchi' +Chijunil li poyanam juntaq'eet wankil xloq'al naq nake'yo'la, ut kama' ak reheb' naq wan xna'leb'eb ut nake'reek'a rib', tento naq te'xk'am rib' sa' usilal chirib'ilrib'eb'. + +Quechua, Ambo-Pasco +Lapan runa kay pachach'u yurin libri kawananpaq, lapanchinuy iwal respetasha kananpaqmi, mana pipis jarupänanpaq, lapanpis iwal yarpach'akuy yach'aqmi, alita mana alita tantiyar kawananpaq. 
Chaynuy runa masinwan juknin jukninwan kuyanakur kapäkuchun + +Quechua, Arequipa-La Unión +Kanmi derechonchiskuna llapanchispa, nacesqanchismanta. Kantaqmi llapanchispa runa kayninchis. Manan runa kanchu manay derechoyoq. Huk runaq derecho hukpawan kaqllan kan. Kanmi derechonchis llapanchispa allin kawsay libre tiyananchispaq. Llapan runaqpan kan yuyayninchis yachanapaq. Llapanchis kasun llapa runa masinchiskunawan munanakunapaq, huk ayllu hina. + +Quechua, Ayacucho +Lliw runakunam nacesqanchikmantapacha libre kanchik, lliw derechonchikpipas iguallataqmi kanchik. Yuyayniyoq kasqanchikraykum hawkalla aylluntin hina kawsayta debenchik llapa runakunawan. + +Quechua, Cajamarca +Yumbay ollqokuna, warmikuna pullalla kashun leyninchiqkunawan. Manam ni pipapis kriyadunchu kanchiqllapa. Suqninchiq, suqninchiq atinchiqllapa yuyayta "imam alli, imam mana allichu" nishpa. Chayshina kaptin, shumaqta tiyashunllapa suq ayllushinalla. + +Quechua, Cusco +Llapa runan kay pachapi paqarin qispisqa, "libre" flisqa, allin kausaypi, chaninchasqa kausaypi kananpaq, yuyayniyoq, yachayniyoq runa kasqanman jina. Llapa runamasinwantaqmi wauqentin jina munanakunan. + +Quechua, Huamalíes-Dos de Mayo Huánuco +Lapan runakunapis yurikuyan librimi y wakinkaqkunanaw rispitashqa, mana jarukushqa kayänanpaq. Saynawmi runakunaqa yuriyan shumaq yarpayyuq, alitapis mana alitapis reqiykar y seqay kuyapäkuyyuq. Saymi runakuna ali kawakuyänan jukninwan jukninwanpis. + +Quechua, Huaylas Ancash +Meyqan nunapis manam pipa sirweqnin nuna kananpaqtsu yurikushqa. I nuna karninmi meyqan nunapis juk láyatsu kayanman derëchunkunachowpis. I yarpachakiyta yacharninmi i allita mana allita shonqonkunachow mákurninmi nunakuna jukninta wiyanakur kayanman. + +Quechua, Margos-Yarowilca-Lauricocha +Lapantsikunapis Iibrimi yurishqantsi. Bälintsimi y derëchuntsikunapis wakinkaqkunanoqlapami. Yarpaynintsikunapis kaykanmi runa mayintsikunawan juk wawqinoq kuyanakur kawapäkunantsipaq. 
+ +Quechua, Northern Conchucos Ancash +Mayqan runapis manam pipa isklabun kananpaqtsu yurishqa. Y runa karninmi llapan runakuna iwal kayanman dirichunkunachawpis. Y yarpayta yacharninmi y allita mana allita shunqunkunachaw makurninmi runakuna huknin hukninta rispitanakur kayanman. + +Quechua, North Junín +Lapan runas kay pachachru nasimun juk rantisha runanuy mana pitas sirbinanpaqmi, alipa rikasha kananpaqmi, washasha kananpaqmi. Lapan runakunas nasipaakamun yarpayniyoqmi naatan tantiyayniyoqmi ima lutanta rurapaakurursi tantiyakunanpaq. Lapan runakunas kawapaakunaman juk wawqenuylam. + +Quechua, South Bolivian +Tukuy kay pachaman paqarimujkuna libres nasekuntu tukuypunitaj kikin obligacionesniycjllataj, jinakamalla honorniyojtaj atiyniyojtaj, chantaqa razonwantaj concienciawantaj dotasqa kasqankurayku, kawsaqe masipura jina, tukuy uj munakuyllapi kawsakunanku tian. + +Quechua (Unified Quichua, old Hispanic orthography) +Tucuy runacuna quishpirihuán huiñán, pactacunahuampes, pay pura, umahuán, ayahuán chay shucuna shina, chaymantami shuclla shina causangacuna. + +Quichua, Chimborazo Highland +Tukuy runakunami maypipash kishpirishka, sumaykaypi(dignidad) paktapakta +wacharin. Chay wawakunaka sumak yuyaykuna, tiksiyuyay (fundamental), +huntami kan; chaymantami runapuraka shukllashina tukushpa, yanaparishpa +kawsana kan. + +Rarotongan +Kua anau rangatira ia te tangata katoatoa ma te aiteite i te au tikaanga e te tu ngateitei tiratiratu.  Kua ki ia ratou e te mero kimi ravenga e te akavangakau e kia akono tetai i tetai, i roto i te vaerua piri anga taeake. + +Romagnolo +Tot j essèri umèn i nàs lébri e cumpagn in dignità e dirét. Lou i è dutid ad rasoun e ad cuscinza e i à da operè, ognun ti cunfrunt at ch'j ilt, sa sentimint ad fratelènza. + +Romani, Balkan +Savorre manuśa biandõn meste thaj barabar k-o demnipen aj k-e hakaja. Si len godi aj somzanipen thaj si len te trąden pen jekh karing o aver and-o vogi e phralimnasqoro. 
+ +Romani, Balkan (1) +Sa e manušikane strukture bijandžona tromane thaj jekhutne ko digniteti thaj čapipa. Von si baxtarde em barvale gndaja thaj godžaja thaj trubun jekh avereja te kherjakeren ko vodži pralipaja. + +Romanian (1953) +Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sînt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității. + +Romanian (1993) +Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sunt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității. + +Romanian (2006) +Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sunt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității. + +Romansch +Tuots umans naschan libers ed eguals in dignità e drets. Els sun dotats cun intellet e conscienza e dessan agir tanter per in uin spiert da fraternità. + +Romansch (Grischun) +Tut ils umans naschan libers ed eguals en dignitad ed en dretgs. Els èn dotads cun raschun e conscienza e duain agir in vers l’auter en spiert da fraternitad. + +Romansch (Puter) +Tuot ils umauns naschan libers ed eguels in dignited ed in drets. Els sun dotos cun radschun e conscienza e dessan agir ün invers l’oter in spiert da fraternited. + +Romansch (Surmiran) +Tot igls carstgangs neschan libers ed eguals an dignitad ed an dretgs. Els èn dotos cun raschung e schientscha e duessan ager l’egn vers l’oter an spiert da fraternitad. + +Romansch (Sursilvan) +Tut ils humans neschan libers ed eguals en dignitad ed en dretgs. Els ein dotai cun raschun e cunscienzia e duein agir in viers l’auter en spért da fraternitad. + +Romansch (Sutsilvan) +Tut igls humans neschan libers ad eguals an dignitad ad an dretgs. Els en dotos cun raschùn a cunzienzia a den agir egn anviers l’oter an spiert da fraternitad. 
+ +Romansch (Vallader) +Tuot ils umans naschan libers ed eguals in dignità ed in drets. Els sun dotats cun radschun e conscienza e dessan agir ün invers l’oter in ün spiert da fraternità. + +Rundi +Abantu bose bavuka bishira bakizana kandi bangana mu gateka no mu ngingo zibubahiriza. Bafise ubwenge n'umutima kandi bategerezwa kwubahana nk'abavandimwe. + +Russian +Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства. + +Rwanda +Abantu bose bavuka aliko bakwiye agaciro no kwubahwa kimwe. Bose bavukana ubwenge n'umutima, bagomba kugilirana kivandimwe. + +Saami, North +Buot olbmot leat riegádan friddjan ja olmmošárvvu ja olmmošvuoigatvuođaid dáfus. Sii leat jierbmalaš olbmot geain lea oamedovdu ja sii gálggaše leat dego vieljačagat. + +Salar +Heme kishler hür der, haysiyet ma haklarde adil der, mantik ma vicdan var, kardeshlikden davraneshge. + +Samoan +O tagata soifua uma ua saoloto lo latou fananau mai, ma e tutusa o latou tulaga aloaia faapea a latou aia tatau. Ua faaeeina atu i a latou le mafaufau lelei ma le loto fuatiaifo ma e tatau ona faatino le agaga faauso i le va o le tasi i le isi, + +Sango +Adü âzo kûê yamba, ngâ âla lîngbi terê na lêgë tî nëngö-terê na tî ângangü. Ala kûê awara ndarä na börö-li sï âla lîngbi tî dutï na âmbâ tî âla gï na lêngö söngö. + +Sanskrit +सर्वे मानवाः स्वतन्त्राः समुत्पन्नाः वर्तन्ते अपि च, गौरवदृशा अधिकारदृशा च समानाः एव वर्तन्ते। एते सर्वे चेतना-तर्क-शक्तिभ्यां सुसम्पन्नाः सन्ति। अपि च, सर्वेऽपि बन्धुत्व-भावनया परस्परं व्यवहरन्तु। + +Sanskrit (Grantha) +𑌸𑌰𑍍𑌵𑍇 𑌮𑌾𑌨𑌵𑌾𑌃 𑌸𑍍𑌵𑌤𑌨𑍍𑌤𑍍𑌰𑌾𑌃 𑌸𑌮𑍁𑌤𑍍𑌪𑌨𑍍𑌨𑌾𑌃 𑌵𑌰𑍍𑌤𑌨𑍍𑌤𑍇 𑌅𑌪𑌿 𑌚, 𑌗𑍌𑌰𑌵𑌦𑍃𑌶𑌾 𑌅𑌧𑌿𑌕𑌾𑌰𑌦𑍃𑌶𑌾 𑌚 𑌸𑌮𑌾𑌨𑌾𑌃 𑌏𑌵 𑌵𑌰𑍍𑌤𑌨𑍍𑌤𑍇। 𑌏𑌤𑍇 𑌸𑌰𑍍𑌵𑍇 𑌚𑍇𑌤𑌨𑌾-𑌤𑌰𑍍𑌕-𑌶𑌕𑍍𑌤𑌿𑌭𑍍𑌯𑌾𑌂 𑌸𑍁𑌸𑌮𑍍𑌪𑌨𑍍𑌨𑌾𑌃 𑌸𑌨𑍍𑌤𑌿। 𑌅𑌪𑌿 𑌚, 𑌸𑌰𑍍𑌵𑍇𑌽𑌪𑌿 𑌬𑌨𑍍𑌧𑍁𑌤𑍍𑌵-𑌭𑌾𑌵𑌨𑌯𑌾 𑌪𑌰𑌸𑍍𑌪𑌰𑌂 𑌵𑍍𑌯𑌵𑌹𑌰𑌨𑍍𑌤𑍁। + +Sãotomense +Tudu nguê di mundu ca nancê livli e igual ni dignidade e ni dirêtu. Punda nen ca pensá e nen tê cunxensa, selá nen fé tudu cuá cu tençón de lumón. 
+ +Sardinian, Logudorese +Totu sos èsseres umanos naschint lìberos e eguales in dinnidade e in deretos. Issos tenent sa resone e sa cussèntzia e depent operare s'unu cun s'àteru cun ispìritu de fraternidade. + +Saxon, Low +All de Minschen sünd frie un gliek an Wüürd un Rechten baren. Se hebbt Vernunft un een Geweten un se schüllt sik Bröder sien. + +Scots +Aw human sowels is born free and equal in dignity and richts. They are tochered wi mense and conscience and shuld guide theirsels ane til ither in a speirit o britherheid. + +Secoya +Si'apai aide'oyë kua'ye peoye kui'ne siayë'kë maka pa'iye kui'ne tutupaye koni, jaje kuasase'sëtepi kuaju'i'ne peoye ñese saiye pa'iji ko̱kaijë yekë paireje. + +Seraiki +سارے انسان ازادا تے حقوق تے عزت دے اعتبار نال ہکو ڄئے پیدا تھیندن ۔ قدرت ولوں انہاں کوں عقل تے سمجھ عطا تھیندی اے ۔ ہیں کیتے ہک ڋوجھے نال بھرپی داسلوک کرڻا چاہی دا اے ۔ + +Serbian (Cyrillic) +Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства. + +Serbian (Latin) +Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i svešću i treba jedni prema drugima da postupaju u duhu bratstva. + +Serer-Sine +Wiin we naa ñoowaa na adna, den fop mbodu no ke war na oxnu refna na den a jega o ngalaat umpi yiif um, le mbarin o meƭtootaa baa mbaag o ñoow den fop no fog. + +Seselwa Creole French +Nou tou imen nou’n ne dan laliberte ek legalite, dan nou dignite ek nou bann drwa. Nou tou nou annan kapasite pou rezonnen, e fodre nou azir anver lezot avek en lespri fraternel. + +Shan +ၵူၼ်းၵူႊၵေႃႉၼႆႉ ပဵၼ်ဢၼ်ၵိူတ်ႇမႃးလူၺ်ႈၵုင်ႇမုၼ်ဢၼ်လွတ်ႈလႅဝ်းၽဵင်ႇပဵင်းၵၼ် လႄႈ သုၼ်ႇလႆႈဢၼ် လွတ်ႈလႅဝ်းၽဵင်ႇ ပဵင်းၵၼ်။ ၶဝ်ၼႆႉ မီးၺၢၼ်ႇဢၼ်မေႃထတ်းသၢင် လႄႈ ၸႂ်ဢၼ်ႁူႉၸၵ်းၾိင်ႈတိုဝ်းၵမ် ၼၼ်ႉလႄႈ ထုၵ်ႇဝႆႉၸႂ်ပီႈဢွၵ်ႇ ၼွင်ႉၶႆႇၵၼ်သေ တိတ်းတေႃႇၵၼ်။ + +Sharanahua +Nantifin naanno rasisin cainnifoquin. Tsoan mato iscahuatiroma cuscan, -Manfin uhuunnacoinquin. 
Ahuua tsacatama rarama shara ninonfo ishon. Nantififain aton mapo shinantirofoquin. Ato nomuranrin chaca iyamarain sharamainqui icashon. Ascanrifiantan nantifin manifoti yorahuan tanannon icashu. + +Shilluk +Dhanhø bëne ba anywølø e path ki bäng, ge pär ki yij bëëdø geki dyërø. gïn-a dwaddi kiper gen yï gen da rumi ki bëëdø mø göög gen ki pyëw akyel ga nyimëgg. + +Shipibo-Conibo +Jatíbi joninra huetsa jonibaon yoiai nincáresti iqui, jahueraquibi jaconmai iamaquin; jainoash jahuen queena jacon jahuéquibo ati jahuequescamabi iqui, tsonbira amayamatima iqui. Jaticashbira jascara aresti jacon shinanya iti jahuequescamabi iqui, jahuequescarainoash picota joni inonbi. Huestiora huestiorabora jahuéqui ati shinanya iqui; jainshon onanribique jahueratoqui jacon iqui jainoash jaconma iqui ishon. Ja copira huetsa jonibires inonbi non jato jaconharesti iqui, non huetsabi non acai quescaaquin. + +Shona +Vanhu vese vanoberekwa vakasununguka uyewo vakaenzana pahunhu nekodzero dzavo. Vanhu vese vanechipo chokufunga nekuziva chakaipa nechakanaka saka vanofanira kubatana nomweya wohusahwira. + +Shor +Парчын кижи, по чарыққа туғчадып, тең, пош туғча. Кижилер сағыштығ, ақтығ туғчалар, кижилерге пашқа кижилербе арғыштаныштарға керек. + +Shuar +Aents yajá nunkanam akínia asamtaish, metekrak ainiaji. Tumasha ni chichamenka tuke amiktin aíniawai. Ni iniakmamuri, ní chichamejaituke aniakmamsar chichakartin aíniawai. Tuma asamtai aents mash nekawar, penker metekrak, nuamtak wará warat shiir pujusarmi tusar aárma awai. + +Sidamo +Manchi beetti kalaqamunni wolaphinoho. Ayirrinyunninna qoossotennino taaloho. Huwatanno tiiano kalaqamunni ba’raarinoha ikkasinni mittu wolu ledo rodiimmate ayyaaninni hee’ra noosi. + +Sinhala +සියලු මනුෂ්‍යයෝ නිදහස්ව උපත ලබා ඇත. ගරුත්වයෙන් හා අයිතිවාසිකම්වලින් සමාන වෙති. යුක්ති අයුක්ති පිළිබඳ හැඟීමෙන් හා හෘදය සාක්ෂියෙන් යුත් ඔවුන්, ඔවුනොවුන්ට සැළකිය යුත්තේ සහෝදරත්වය පිළිබඳ හැඟීමෙනි. 
+ +Siona +Sia'bai̱ aideo'yë goa'ye beoye gu̱i'ne sia'yë'quë maca +bai'ye gu̱i'ne quëco baye co̱ni, ja̱je̱ goachase'sëte goa'ju̱i'ñe beoye ñese saiye +bai'ji co̱caijë yequë bai̱reje. + +Slovak +Všetci ľudia sa rodia slobodní a sebe rovní , čo sa týka ich dostojnosti a práv. Sú obdarení rozumom a majú navzájom jednať v bratskom duchu. + +Slovenian +Vsi ljudje se rodijo svobodni in imajo enako dostojanstvo in enake pravice. Obdarjeni so z razumom in vestjo in bi morali ravnati drug z drugim kakor bratje. + +Somali +Aadanaha dhammaantiis wuxuu dhashaa isagoo xor ah kana siman xagga sharafta iyo xuquuqada Waxaa Alle (Ilaah) siiyay aqoon iyo wacyi, waana in qof la arkaa qofka kale ula dhaqmaa si walaaltinimo ah. + +Soninke +Haadama renme su saareyen ŋa an na du-kitten ña, an nta sere komaaxu, an do soron su yan yekka dorontaaxu do taqu. Haqilen, wa sere su, a do soro kuttu nan siri terene doome kappalengaaxu kanma. + +Sorbian, Upper +Wšitcy čłowjekojo su wot naroda swobodni a su jenacy po dostojnosći a prawach. Woni su z rozumom a swědomjom wobdarjeni a maja mjezsobu w duchu bratrowstwa wobchadźeć. + +Sotho, Northern +Batho ka moka ba belegwe ba lokologile le gona ba na le seriti sa go lekana le ditokelo. Ba filwe monagano le letswalo mme ba swanetše go swarana ka moya wa bana ba mpa. + +Sotho, Southern +Batho bohle ba tswetswe ba lokolohile mme ba lekana ka botho le ditokelo. Ba tswetswe le monahano le letswalo mme ba tlamehile ho phedisana le ba bang ka moya wa boena. + +South Azerbaijani +Tüm insanlar hür döğarlar, hak ve onur bakımından eşit döğarlar, onlar akıl ve vicdana sahiptirler ve birbirlerine karşı kardeşlik ruhu içinde davranmalılar. + +Spanish +Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros. 
+ +Spanish (resolution) +Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros. + +Sukuma +Banhu bose bakabyalagwa na wiyabi na bakabizaga na makujo na sekge jabo jilenganilile. Banhu bose bakabizaga na masala na buhabuji; hukuyomba balidakilwa gubi na witogwa gidi bana ba myaji umo. + +Sunda +Sakumna jalma gubrag ka alam dunya teh sifatna merdika jeung boga martabat katut hak-hak anu sarua . Maranehna dibere akal jeung hate nurani, campur-gaul jeung sasamana aya dina sumanget duduluran. + +Susu +Adamadie birin barixinɛ e lan yɛtɛralui kui, yɛtɛ kolonyi nun yɛtɛ suxu kima. Fondoe nun faxamui na e bɛ boresuxu kima bariboreya fanyi kui. + +Swahili +Watu wote wamezaliwa huru, hadhi na haki zao ni sawa. Wote wamejaliwa akili na dhamiri, hivyo yapasa watendeane kindugu. + +Swati +Bonkhe bantfu batalwa bakhululekile balingana ngalokufananako ngesitfunti nangemalungelo. Baphiwe ingcondvo nekucondza kanye nanembeza ngakoke bafanele batiphatse nekutsi baphatse nalabanye ngemoya webuzalwane. + +Swedish +Alla människor äro födda fria och lika i värde och rättigheter. De äro utrustade med förnuft och samvete och böra handla gentemot varandra i en anda av broderskap. + +Tagalog +Ang lahat ng tao'y isinilang na malaya at pantay-pantay sa karangalan at mga karapatan. Sila'y pinagkalooban ng katwiran at budhi at dapat magpalagayan ang isa't isa sa diwa ng pagkakapatiran. + +Tagalog (Tagalog) +ᜀᜅ ᜎᜑᜆ᜔ ᜅ ᜆᜂᜌ᜔ ᜁᜐᜒᜈᜒᜎᜅ ᜈ ᜋᜎᜌ ᜀᜆ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜐ ᜃᜇᜅᜎᜈ᜔ ᜀᜆ᜔ ᜋ᜔ᜄ ᜃᜇᜓᜉᜆᜈ᜔᜶ ᜐᜒᜎᜌ᜔ ᜉᜒᜈᜄ᜔ᜃᜎᜓᜊᜈ᜔ ᜅ ᜃᜆ᜔ᜏᜒᜇᜈ᜔ ᜀᜆ᜔ ᜊᜓᜇ᜔ᜑᜒ ᜀᜆ᜔ ᜇᜉᜆ᜔ ᜋᜄ᜔ᜉᜎᜄᜌᜈ᜔ ᜀᜅ ᜁᜐᜆ᜔ ᜁᜐ ᜐ ᜇᜒᜏ ᜅ ᜉᜄ᜔ᜃᜃᜉᜆᜒᜇᜈ᜔᜶ + +Tahitian +E fanauhia te tā'āto'ara'a o te ta'ata-tupu ma te ti'amā e te ti'amanara'a 'aifaito. 
Ua 'ī te mana'o pa'ari e i te manava e ma te 'a'au taea'e 'oia ta ratou ha'a i rotopū ia ratou iho, e ti'a ai; + +Tai Dam +ꪹꪕꪸꪉ ꪀꪱ ꪋꪴ ꫛ ꪎꪲꪉ ꪮꪮꪀ ꪣꪱ ꪻꪠ ꪁꪷ ꪻꪬ ꪼꪒ ꪕꪳ ꪕꪱꪉ ꪀꪾꪚ ꪹꪋꪷꪉ ꪝꪸꪉ ꪕꪮꪥ ꪩꪾ ꫛ ꪶꪔꪙ ꪠꪴ - ꪋꪴ ꪬꪺ ꫛ ꪻꪠ ꪁꪷ ꪻꪬ ꪣꪲ ꪁꪫꪸꪙ ꪎꪱꪉ ꪶꪎꪣ ꪩꪺꪉ ꪹꪥꪸꪒ ꫛ ꪀꪾꪚ ꪹꪥꪸꪒ ꪻꪊ ꪚꪴꪙ ꪀꪾꪚ ꪼꪒ ꪹꪚꪷꪉ ꪒꪲ ꪀꪾꪚ ꪫꪸꪀ ꪭꪰꪀ ꪵꪝꪉ ꪹꪏꪉ ꪹꪭꪙ ꪒꪸꪫ. + +Tajiki +Тамоми одамон озод ва аз лиҳози шарафу ҳуқуқ ба ҳам баробар ба дунё меоянд. Онҳо соҳиби ақлу виҷдонанд ва бояд бо якдигар муносибати бародарона дошта бошанд. + +Talysh +Həmmə insonon bəştə ləyoğəti iyən həxonro ozod iyən bərobər movardə bedən. Çəvon şuur iyən vicdon hese, əve ki, deyəndı mınasibətədə bənə bıvə rəftor kardəninin. + +Tamang, Eastern +म्होक्कोन (गोदोप) नोन म्हीम केपान्हापा हेन्छे नुन हाङपाङवा (स्वतन्त्र) याङवा हीन्ना । थे म्होक्कोनला (गोदोपला) च्योच्यो याङताम थेन महत्व मुला । थेनीकादेरी सेमबाङ (विचार शक्ति) देन थु-सेमसाङ मुबासे थेनीजुगुसे ह्राङन्हाङरी नुन थेत्माला सेमलेङमोग्याम्से (भवनाबाट) ग्ये लातोबान मुला । + +Tamazight, Central Atlas +Imdanen, akken ma llan ttlalen d ilelliyen msawan di lḥweṛma d yizerfan- ghur sen tamsakwit d lâquel u yessefk ad-tili tegmatt gar asen. + +Tamazight, Central Atlas (Tifinagh) +ⵉⵎⴷⴰⵏⴻⵏ, ⴰⴽⴽⴻⵏ ⵎⴰ ⵍⵍⴰⵏ ⵜⵜⵍⴰⵍⴻⵏ ⴷ ⵉⵍⴻⵍⵍⵉⵢⴻⵏ ⵎⵙⴰⵡⴰⵏ ⴷⵉ ⵍⵃⵡⴻⵕⵎⴰ ⴷ ⵢⵉⵣⴻⵔⴼⴰⵏ-ⵖⵓⵔ ⵙⴻⵏ ⵜⴰⵎⵙⴰⴽⵡⵉⵜ ⴷ ⵍⴰⵇⵓⴻⵍ ⵓ ⵢⴻⵙⵙⴻⴼⴽ ⴰⴷ-ⵜⵉⵍⵉ ⵜⴻⴳⵎⴰⵜⵜ ⴳⴰⵔ ⴰⵙⴻⵏ. + +Tamazight, Standard Morocan +ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ. + +Tamil +மனிதப் பிறிவியினர் சகலரும் சுதந்திரமாகவே பிறக்கின்றனர்; அவர்கள் மதிப்பிலும், உரிமைகளிலும் சமமானவர்கள், அவர்கள் நியாயத்தையும் மனச்சாட்சியையும் இயற்பண்பாகப் பெற்றவர்கள். அவர்கள் ஒருவருடனொருவர் சகோதர உணர்வுப் பாங்கில் நடந்துகொள்ளல் வேண்டும். + +Tamil (Sri Lanka) +மனிதப் பிறிவியினர் சகலரும் சுதந்திரமாகவே பிறக்கின்றனர்; அவர்கள் மதிப்பிலும், உரிமைகளிலும் சமமானவர்கள், அவர்கள் நியாயத்தையும் மனச்சாட்சியையும் இயற்பண்பாகப் பெற்றவர்கள். அவர்கள் ஒருவருடனொருவர் சகோதர உணர்வுப் பாங்கில் நடந்துகொள்ளல் வேண்டும். 
+ +Tatar +Барлык кешеләр дә азат һәм үз абруйлары һәм хокуклары ягыннан тиң булып туалар. Аларга акыл һәм вөҗдан бирелгән һәм бер-берсенә карата туганарча [туганнарча] мөнәсәбәттә булырга тиешләр. + +Telugu +ప్రతిపత్తిస్వత్వముల విషయమున మానవులెల్లరును జన్మతః స్వతంత్రులును సమానులును నగుదురు. వారు వివేచన-అంతఃకరణ సంపన్నులగుటచే పరస్పరము భ్రాతృభావముతో వర్తింపవలయును. + +Tem +Bánlʊrʊ́ʊ ɩrʊ́ báa weení na kezéńbíídi gɛ bɩka bɛdɛ́ɛ ɖɔɔzɩ́tɩ na yíkowá kɛgɛ́ɛ ɖéyí-ɖéyí gɛ. Bɔwɛná laakárɩ na ɩrʊ́tɩ bɩka bɩɩbɔ́ɔ́zɩ bɔcɔɔná ɖamá koobíre cɔwʊrɛ. + +Tetun +Ema hotu hotu moris hanesan ho dignidade ho direitu. Sira hotu iha hanoin, konsiensia n'e duni tenki hare malu hanesan espiritu maun-alin. + +Tetun Dili +Ema tomak moris hanesan, ema tomak hanesan, iha direitu hanesan. Ema tomak iha otak ho neon, hotu-hotu sei buka moris hanesan maun ho alin. + +Thai +มนุษย์ทั้งหลายเกิดมามีอิสระและเสมอภาคกันในเกียรติศักด[เกียรติศักดิ์]และสิทธิ ต่างมีเหตุผลและมโนธรรม และควรปฏิบัติต่อกันด้วยเจตนารมณ์แห่งภราดรภาพ + +Thai (2) +มนุษย์ทั้งปวงเกิดมามีอิสระและเสมอภาคกันในศักดิ์ศรีและสิทธิ ต่างในตนมีเหตุผลและมโนธรรม และควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ + + +Themne +A kom aŋfəm akəpet bɛ ŋa athənʌnɛ yi rʌwankom. Ɔwa aŋ ba məmari məthənʌnɛ. Ɔwa aŋ ba məfith yi təchemp. Chiyaŋ, aŋ yi təkə gbasi aŋkos ŋaŋ mɔ kəpa ŋa təkom. + +Tibetan, Central +འགྲོ་བ་མིའི་རིགས་རྒྱུད་ཡོངས་ལ་སྐྱེས་ཙམ་ཉིད་ནས་ཆེ་མཐོངས་དང༌། ཐོབ་ཐངགི་རང་དབང་འདྲ་མཉམ་དུ་ཡོད་ལ། ཁོང་ཚོར་རང་བྱུང་གི་བློ་རྩལ་དང་བསམ་ཚུལ་བཟང་པོ་འདོན་པའི་འོས་བབས་ཀྱང་ཡོད། དེ་བཞིན་ཕན་ཚུན་གཅིག་གིས་གཅིག་ལ་བུ་སྤུན་གྱི་འདུ་ཤེས་འཛིན་པའི་བྱ་སྤྱོད་ཀྱང་ལག་ལེན་བསྟར་དགོས་པ་ཡིན༎ + +Ticuna +Ngẽxguma nabuxgu i duü̃xü̃gü rü guxü̃ma nawüxigu, rü tataxuma ya texé ya togüarü yexera ixĩsẽ. Rü guxü̃ma naxããẽgü rü ngẽmaca̱x rü name nixĩ na nügümaã namecümaxü̃ ĩ guxü̃ma ĩ duü̃xü̃gü. + +Tigrigna +ብመንፅር ክብርን መሰልን ኩሎም ሰባት እንትውለዱ ነፃን ማዕሪን እዮም፡፡ ምስትውዓልን ሕልናን ዝተዓደሎም ብምዃኖም ንሕድሕዶም ብሕውነታዊ መንፈስ ክተሓላለዩ ኦለዎም፡፡ + +Tiv +I mar maor ken kpan ga, nan ngu a icivir man mbamkpeiyol cii. 
I na nan mhen man ishima i kaven kwagh; nahan gba keng u nana tema a orgen ken mtem u angbian a angbian. + +Toba +'Enauac na naaxat shiỹaxauapi na mayipi huesochiguii qataq 'eeta'a't da l'amaqchic qataq da 'enec qataq ỹataqta ỹaỹate'n naua lataxaco qataq nua no'o'n nvilỹaxaco, qaq ỹoqo'oyi iuen da i 'oonolec ỹataqta itauan ichoxoden ca lỹa + +Tojolabal +Spetsanal ja swinkil ja lu’um k’inali junxta wax jul schonjel, sok ja sijpanub’ali, ja yuj ojni b’ob’ sk’u’luk ja jas sk’ana-i ja b’as lekilali, ja yuj ja ay sk’ujoli sok ay spensari t’ilan oj yilsb’aje lek sok ja smoj jumasa. + +Tok Pisin +Yumi olgeta mama karim umi long stap fri na wankain long wei yumi lukim i gutpela na strepela tru. Uumi olgeta igat ting ting bilong wanem samting I rait na rong na mipela olgeta I mas mekim gutpela pasin long ol narapela long tingting bilong brata susa. + +Tonga +Bantu boonse balazyalwa kabaangulukide alimwi kabeelene alimwi akwaanguluka kucita zyobayanda. Balazyalwa amaanu akuyeeya, aakusala alimwi beelede kulanga bambi mbuli banabokwabo. + +Tongan +Ko e kotoa ‘o ha’a tangata ‘oku fanau’i mai ‘oku tau’ataina pea tatau ‘i he ngeia mo e ngaahi totonu. Na’e fakanaunau’i kinautolu ‘aki ‘a e ‘atamai mo e konisenisi pea ‘oku totonu ke nau feohi ‘i he laumalie ‘o e nofo fakatautehina. + +Totonac, Papantla +Wakg lakch'ixkuwin talakgawan nak ka'unin niti ka'akgch'apawalinit nachuna wakg takg'alhi ixtamaxanatkan chu tu kaminini, je'e wanp'utun xlakata wakg talakpuwanan, talalakgk'atsan liwakg, talakask'ini xlakata wakg natalamakgtaya. + +Tsonga (Mozambique) +Vanhu hin'kwavu va psaliwili na va khululekìle, funthsi va fana hi lisima ni tinfaneno. Và psaliwili ni nyiko ya ku pimisa ni ku yehleketa; hi kolahu, va fanela ku hanya hi moya wa umbìlu ni unghani. + +Tsonga (Zimbabwe) +Vanhu hinkwavo va tswariwa va tshunxekile naswona va ringanile eka tifanelo na xindzhuti. Va havaxerile miehleketo na tshiriti kumbe ku tiva xo biha ni xta kahle nakambe va fanele va kombana moya wa vukwavo. 
+ +Tswana +Batho botlhe ba tsetswe ba gololosegile le go lekalekana ka seriti le ditshwanelo. Ba abetswe go akanya le maikutlo, mme ba tshwanetse go direlana ka mowa wa bokaulengwe. + +Turkish +Bütün insanlar hür, haysiyet ve haklar bakımından eşit doğarlar. Akıl ve vicdana sahiptirler ve birbirlerine karşı kardeşlik zihniyeti ile hareket etmelidirler. + +Turkmen (Cyrillic) +Хемме адамлар өз мертебеси ве хукуклары боюнча дең ягдайда дүнйә инйәрлер. Олара аң хем выҗдан берлендир ве олар бир‐бирлери билен доганлык рухундакы гарайышда болмалыдырлар. + +Turkmen (Latin) +Adamlaryň hemmesi azat dogulýarlar we öz mertebesi hem‐de hukuklary boýunça ilkibaşdan deňdirler. Olara ozal‐başdan aň, ynsap berlendir we biri‐birine özara doganlyk ruhunda çemeleşmek olaryň ýaraşygydyr. + +Tuva +Бүгү кижилер хостуг база мөзүзү болгаш эргелери дең кылдыр төрүттүнер. Оларга угаансарыыл болгаш арын-нүүр бердинген болур болгаш олар бот-боттарынга акы-дуңмалышкы хамаарылганы көргүзер ужурлуг. + +Twi (Akuapem) +Wɔawo adesamma nyinaa sɛ nnipa a wɔwɔ ahofadi. Wɔn nyinaa wɔ nidi ne kyɛfa koro. Wɔwɔ adwene ne ahonim, na ɛsɛ sɛ wobu wɔn ho wɔn ho sɛ anuanom. + +Twi (Asante) +Nnipa nyinaa yɛ pɛ. Na wɔde adwene ne nyansa na abɔ obiara. Ɛno nti, ɛsɛ sɛ obiara dɔ ne yɔnko, bu ne yɔnko, di ne yɔnko ni. + +Tzeltal, Oxchuc +Spisil winiketik te ya xbejk´ajik ta k´inalil ay jrerechotik, mayuk mach´a chukul ya xbejka, ya jnatik stojol te jpisiltik ay snopibal sok sbijil joltik, ja´ me k´ux ya kaibatik ta jujun tul. + +Tzotzil (Chamula) +Skotol vinik o ants ta spejel balumile k’olem x-hayan i ko’ol ta sch’ulal i sderechoetik i, skotol k’ux-elan oyike oy srasonik y slekilalik, sventa skuxijik leknóo ta ju jun ju ju vo. + +Uduk +Aris ’kwaniny’ceshi ’baar mo dho’thkunu ’baḵany mo dhali mmomiiya ṯu’c imonṯal ’de/ mo dhali mii ma ḵar/e mo. Uni mini ta gi gwo mo dhali mii mo dhali uni mini mii ka karambuye/ ’kup̱ ki cin tiya mo e shi/in mo dhali mii kun tanu ikam mo. 
+ +Ukrainian +Всі люди народжуються вільними і рівними у своїй гідності та правах. Вони наділені розумом і совістю і повинні діяти у відношенні один до одного в дусі братерства. + +Umbundu +Omanu vosi vacitiwa valipwa kwenda valisoka kovina vyosikwenda komoko. Ovo vakwete esunga kwenda, kwenda olondunge kwenje ovo vatêla okuliteywila kuvamwe kwenda vakwavo vesokolwilo lyocisola. + +Umbundu (011) +Omanu vosi kilu lieve va citiwa lonjila yimosi leyovo limosi, lomoko yimosi kuenda unu umosi, kuenje momo vosi va kuete olondunge, va sesamela okulisumbila pokati ndavamanji. + +Urarina +Ita rijiicha itolere cacha. Aihana jaun, ita belaain, naojoain neuruhine laurilaurichuru nenacaauru aina itolere cachaauru. + +Urdu +تمام انسان آزاد اور حقوق و عزت کے اعتبار سے برابر پیدا ہوئے ہیں۔ انہیں ضمیر اور عقل ودیعت ہوئی ہے۔ اس لئے انہیں ایک دوسرے کے ساتھ بھائی چارے کا سلوک کرنا چاہیئے۔ + +Urdu (2) +تمام انسان آزاد اور حقوق و عزت کے اعتبار سے برابر پیدا ہوئے ہیں۔ انہیں ضمیر اور عقل ودیعت ہوئی ہے۔ اس لیے انہیں ایک دوسرے کے ساتھ بھائی چارے کا سلوک کرنا چاہیے۔ + +Uyghur (Arabic) +ھەممە ئادەم زانىدىنلا ئەركىن، ئىززەت-ھۆرمەت ۋە ھوقۇقتا باپباراۋەر بولۇپ تۇغۇلغان. ئۇلار ئەقىلغە ۋە ۋىجدانغا ئىگە ھەمدە بىر-بىرىگە قېرىنداشلىق مۇناسىۋىتىگە خاس روھ بىلەن موئامىلە قىلىشى كېرەك. + +Uyghur (Latin) +hemme adem zatidinla erkin, izzet-hörmet we hoquqta babbarawer bolup tughulghan. ular eqilghe we wijdan'gha ige hemde bir-birige qérindashliq munasiwitige xas roh bilen muamile qilishi kérek. + +Uzbek, Northern (Cyrillic) +Барча одамлар эркин, қадр‐қиммат ва ҳуқуқларда тенг бўлиб туғиладилар. Улар ақл ва виждон соҳибидирлар ва бир‐бирларига биродарларча муомала қилишлари зарур. + +Uzbek, Northern (Latin) +Barcha odamlar erkin, qadr‐qimmat va huquqlarda teng boʻlib tugʻiladilar. Ular aql va vijdon sohibidirlar va bir‐birlariga birodarlarcha muomala qilishlari zarur. + +Vai +ꕉꕜꕮ ꔔꘋ ꖸ ꔰ ꗋꘋ ꕮꕨ ꔔꘋ ꖸ ꕎ ꕉꖸꕊ ꕴꖃ ꕃꔤꘂ ꗱ, ꕉꖷ ꗪꗡ ꔻꔤ ꗏꗒꗡ ꕎ ꗪ ꕉꖸꕊ ꖏꕎ. ꕉꕡ ꖏ ꗳꕮꕊ ꗏ ꕪ ꗓ ꕉꖷ ꕉꖸ ꕘꕞ ꗪ. ꖏꖷ ꕉꖸꔧ ꖏ ꖸ ꕚꕌꘂ ꗷꔤ ꕞ ꘃꖷ ꘉꔧ ꗠꖻ ꕞ ꖴꘋ ꔳꕩ ꕉꖸ ꗳ. 
+ +Venda +Vhathu vhoṱhe vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. Vhathu vhoṱhe vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi. + +Venda +Vhathu vhoṱhe vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. Vhathu vhoṱhe vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi. + +Venetian +Tuti i èsari umani i nase łìbari e conpanji par dinjità e deriti. I ze dotài de rajon e de cosiensa e i ga da conportarse intrà de łori co spìrito de fradełi. + +Veps +Kaik mehed sünduba joudajin i kohtaižin, ühtejiččin ičeze arvokahudes i oiktusiš. Heile om anttud mel’ i huiktusentund i heile tariž kožuda toine toiženke kut vel’l’kundad. + +Vietnamese +Tất cả mọi người sinh ra đều được tự do và bình đẳng về nhân phẩm và quyền. Mọi con người đều được tạo hoá ban cho lý trí và lương tâm và cần phải đối xử với nhau trong tình bằng hữu. + +Vietnamese (Han nom) +畢哿每𠊛生𠚢調得自由吧平等𧗱人品吧權。每𡥵𠊛調得造化頒朱理智吧良心吧勤沛對處𢭲膮𥪝情朋友。 + +Waama +Yiriba na bà sikindo dare bà mɛɛri, da seena yirimma mii bà ta da i nɛki bà tɔɔba. + +Walloon +Tos lès-omes vinèt-st-å monde lîbes, èt so-l'minme pîd po çou qu'ènn'èst d'leu dignité èt d'leus dreûts. I n'sont nin foû rêzon èt-z-ont-i leû consyince po zèls, çou qu'èlzès deût miner a s'kidûre onk' po l'ôte tot come dès frés. + +Waorani +Tomamo waomo ekame wee anamay inani tomemo kewengi beye tomamo +neemompa noynga impa aye anobay impa wadani inanite wakeki beye +angampa. + +Waray-Waray +Nga an ngatanan nga mga tawo, nahimugso talwas ug katpong ha ira dignidad ug katdungan. Hira natawo dinhi ha tuna mayda konsensya ug isip ug kaangayan gud la nga an ira pagtagad ha tagsatagsa sugad hin magburugto. + +Wayuu +Naa wayuukana jemeishi süpüla taashi süma wanawa sülu'u nakua'ipa, aka müin yaa epijainjana sünain anajiranawaa a'in nama napüshi. + +Welsh +Genir pawb yn rhydd ac yn gydradd â’i gilydd mewn urddas a hawliau. 
Fe’u cynysgaeddir â rheswm a chydwybod, a dylai pawb ymddwyn y naill at y llall mewn ysbryd cymodlon. + +Wolof +Doomi aadama yépp danuy juddu, yam ci tawfeex ci sag ak sañ-sañ. Nekk na it ku xam dëgg te ànd na ak xelam, te war naa jëflante ak nawleen, te teg ko ci wàllu mbokk. + +Xhosa +Bonke abantu bazalwa bekhululekile belingana ngesidima nangokweemfanelo. Bonke abantu banesiphiwo sesazela nesizathu sokwenza isenzo ongathanda ukuba senziwe kumzalwane wakho. + +Yagua +Ne sarupay nijyami cumudeju darvantyamuy javatyasjiu. Jachipiyadati mirvara samirva, mirvamuy ne samirva. Ramunltiy sarivichanichara samirvariy jityunu vichavay. + +Yakut +Дьон барыта бэйэ суолтатыгар уонна быраабыгар тэҥ буолан төрүүллэр. Кинилэр бары өркөн өйдөөх, суобастаах буолан төрүүллэр, уонна бэйэ бэйэлэригэр тылга кииринигэс быһыылара доҕордоһуу тыыннаах буолуохтаах. + +Yaneshaʼ +Allohueney ñeñtey arromñatey att̃o ye'ñalletyesa arr patsro e'ñe att̃ecma cohuen yesherb̃a'yen. Ñam̃a yechyen allpon derechos att̃och e'ñech cohueno'tsa'yeney arr patsro. Ñam̃a allohuen att̃ecma yechyen alloch yoct̃ape' chyen cohuen ñam̃a yeñotyen yeyoc̈hro ñeñt ̃e'ne pocte' enten acheñenesha' ñam̃a ñeñt ̃ama pocteye' enteneto. Yeñoteñ añ poctetsa e'ñe yemo'nasheñ yep̃annena ama't ora allohuen allpon acheñenesha' ñeñt ̃añe patsro'tsa'yeney. + +Yanomamö +Kõmi thë pë rë përiprawë rë piyëkëi, he usukuwë thë pë keprou ai thë ã rëamaihã no ã heparohowë, totihitawë thë pë riã rẽ thaiwehei hami, thë pë puhi tao kãi përihiwëha, thë pë puhi kãi katehewëha hawë kama thë pë mashi shĩro përihimopë. + +Yao +Wandu wosope akasapagwa ni ufulu ni uchimbichimbi wakulandana. Asapagwa ni lunda, niwakupakombola ganisya, m'yoyo kukusosekwa kuti mundu jwalijose am'woneje mundu jwimwe mpela mlongomjakwe. + +Yapese +Gubine gidii mani gargeleg nga faileng nibapuf mattʼawen nge rogon. Bay laniyan nipii e nam, ere ngauda ted mattʼaawen e chaa niba chugur ngoded nimod walag dad. 
+ +Yiddish, Eastern +יעדער מענטש װערט געבױרן פֿרײַ און גלײַך אין כּבֿוד און רעכט. יעדער װערט באַשאָנקן מיט פֿאַרשטאַנד און געװיסן; יעדער זאָל זיך פֿירן מיט אַ צװײטן אין אַ געמיט פֿון ברודערשאַפֿט. + +Yoruba +Gbogbo ènìyàn ni a bí ní òmìnira; iyì àti ẹ̀tọ́ kọ̀ọ̀kan sì dọ́gba. Wọ́n ní ẹ̀bùn ti làákàyè àti ti ẹ̀rí‐ọkàn, ó sì yẹ kí wọn ó máa hùwà sí ara wọn gẹ́gẹ́ bí ọmọ ìyá. + +Yukaghir, Northern +Көдэҥ тэн - ньидитэ бандьэ параԝааньэрэҥ тудэ чуҥдэн ньилдьилэк эннулҥинь-мэдьуолнуни. Көдэҥ энмун чундэ мэ льэй, таатльэр лукундьии ньинэмдьийилпэ дитэ эннуйуол-мораԝньэҥи. + +Záparo +Kawiriaja kayapuina ichaukui ta nuka pucha panicha kupanimajicha cha nuka nishima ikicha kiniana panicha tamanuka kanata ikimajicha. + +Zapotec, Güilá +Ra'ta ra bu:unny ra:aaly liebr cëhnn te'bloh deree'ch cëhnn dignidaa. Ra:alyne:erih gahll ri:e:eny cëhnn saalyb, chiru' na:a pahr ga:annza'crih loh sa'rih. + +Zapotec, Miahuatlán +Diti mien ndied xa yent kuan nkie xa nak rieti xa diba xa rola. + +Zarma +Fayanka kulu no si adamayzey nda care game ra i burcintara nda i alhakey cediraw kayandiyaŋ fondo ra da i na i hay. I gonda lakkal, nda laasaabu, kaŋ ga naŋ i ma baafunay ɲayzetaray haali ra. + +Zhuang, Yongbei +Boux boux ma daengz lajmbwn couh miz cwyouz, cinhyenz caeuq genzli bouxboux Bingzdaengj. gyoengq vunz miz lijsing caeuq liengzsim, wngdang daih gyoengq de lumj beixnuengx ityiengh. + +Zulu +Bonke abantu bazalwa bekhululekile belingana ngesithunzi nangamalungelo. Bahlanganiswe wumcabango nangunembeza futhi kufanele baphathane ngomoya wobunye. + + + +------ diff --git a/example-docs/language-docs/eng_afr_spa.txt b/example-docs/language-docs/eng_afr_spa.txt new file mode 100644 index 0000000000..e589f238d1 --- /dev/null +++ b/example-docs/language-docs/eng_afr_spa.txt @@ -0,0 +1,5 @@ +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. 
All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. Spanish: "Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros." Africaans: "Alle menslike wesens word vry, met gelyke waardigheid en regte, gebore. Hulle het rede en gewete en behoort in die gees van broederskap teenoor mekaar op te tree." All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. 
diff --git a/example-docs/language-docs/eng_spa.txt b/example-docs/language-docs/eng_spa.txt new file mode 100644 index 0000000000..16a4ca10c0 --- /dev/null +++ b/example-docs/language-docs/eng_spa.txt @@ -0,0 +1,5 @@ +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. "Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros." All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. 
All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. diff --git a/example-docs/language-docs/eng_spa_mult.txt b/example-docs/language-docs/eng_spa_mult.txt new file mode 100644 index 0000000000..ceb7629165 --- /dev/null +++ b/example-docs/language-docs/eng_spa_mult.txt @@ -0,0 +1,9 @@ +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. "Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros. Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros." 
+ +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. + +"Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros. Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros." From b9fa20ab461c64bdf876d4ae9adab85d7cb13ae9 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Fri, 6 Oct 2023 13:49:03 -0500 Subject: [PATCH 2/3] fix: isolate metadata imports to doctype (#1671) In a different PR, some no-extras tests started failing with import errors when something innocuous was imported from `unstructured.file_utils.metadata`. 
This turned out to be because of the top-level, doctype-specific imports in that file. Importing a general metadata object shouldn't require installation of modules like `PIL`, `docx`, and `openpyxl`. To fix, I moved these functions to be imported inside the functions that use them, and added the `requires_dependencies` decorator to the functions. #### Testing: You should be able to run something like: ```python from unstructured.file_utils.metadata import Metadata ``` Without `openpyxl` installed. --- CHANGELOG.md | 5 +++-- unstructured/__version__.py | 2 +- unstructured/file_utils/metadata.py | 15 +++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98566c7786..68938ae3e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.10.20-dev4 +## 0.10.20-dev5 ### Enhancements -* **Align to top left when shrinking bounding boxes for `xy-curt` sorting:** Update `shrink_bbox()` to keep top left rather than center +* **Align to top left when shrinking bounding boxes for `xy-cut` sorting:** Update `shrink_bbox()` to keep top left rather than center. * **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()). * **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles. @@ -15,6 +15,7 @@ setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. 
### Fixes +* **Fix prevent metadata module from importing dependencies from unnecessary modules** Problem: The `metadata` module had several top level imports that were only used in and applicable to code related to specific document types, while there were many general-purpose functions. As a result, general-purpose functions couldn't be used without unnecessary dependencies being installed. Fix: moved 3rd party dependency top level imports to inside the functions in which they are used and applied a decorator to check that the dependency is installed and emit a helpful error message if not. * **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent html `H1` when nested headings are present; otherwise, `category_depth` metadata can result ambiguous within elements in a page. * **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications. While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate. 
* **Fixes badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 085ba4d4be..adcfc625cb 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20-dev4" # pragma: no cover +__version__ = "0.10.20-dev5" # pragma: no cover diff --git a/unstructured/file_utils/metadata.py b/unstructured/file_utils/metadata.py index e58831b2ce..20e4476287 100644 --- a/unstructured/file_utils/metadata.py +++ b/unstructured/file_utils/metadata.py @@ -3,10 +3,7 @@ from dataclasses import dataclass, field from typing import IO, Any, Dict, Final, Optional -import docx -import openpyxl -from PIL import Image -from PIL.ExifTags import TAGS +from unstructured.utils import requires_dependencies # NOTE(robison) - ref: https://www.media.mit.edu/pia/Research/deepview/exif.html EXIF_DATETIME_FMT: Final[str] = "%Y:%m:%d %H:%M:%S" @@ -39,11 +36,14 @@ def to_dict(self): return self.__dict__ +@requires_dependencies("docx") def get_docx_metadata( filename: str = "", file: Optional[IO[bytes]] = None, ) -> Metadata: """Extracts document metadata from a Microsoft .docx document.""" + import docx + if filename: doc = docx.Document(filename) elif file: @@ -72,11 +72,14 @@ def get_docx_metadata( return metadata +@requires_dependencies("openpyxl") def get_xlsx_metadata( filename: str = "", file: Optional[IO[bytes]] = None, ) -> Metadata: """Extracts document metadata from a Microsoft .xlsx document.""" + import openpyxl + if filename: workbook = openpyxl.load_workbook(filename) elif file: @@ -106,11 +109,15 @@ def get_xlsx_metadata( return metadata +@requires_dependencies("PIL") def get_jpg_metadata( filename: str = "", file: Optional[IO[bytes]] = None, ) -> Metadata: """Extracts metadata from a JPG image, including EXIF metadata.""" + from PIL import Image + from PIL.ExifTags import TAGS + 
if filename: image = Image.open(filename) elif file: From 2e1404e02cb37b9c8b89a9fd8ee06c23aba39999 Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:49:29 -0400 Subject: [PATCH 3/3] refactor: unstructured ingest as a pipeline (#1551) ### Description As we add more and more steps to the pipeline (i.e. chunking, embedding, table manipulation), it would help separate the responsibility of each of these into their own processes, running each in parallel using json files to share data across. This will also help guarantee data is serializable if this code was used in an actual pipeline. Following is a flow diagram of the proposed changes. As part of this change: * A parent pipeline class will be responsible for running each `node`, which can optionally be run via multiprocessing if it supports it, or not. Possible nodes at this moment: * Doc factory: creates all the ingest docs via the source connector * Source: reads/downloads all of the content to process to the local filesystem to the location set by the `download_dir` parameter. * Partition: runs partition on all of the downloaded content in json format. * Any number of reformat nodes that modify the partitioned content. This can include chunking, embedding, etc. * Write: push the final json into the destination via the destination connector * This pipeline relies on the information of the ingest docs to be available via their serialization. An optimization was introduced with the `IngestDocJsonMixin` which adds in all the `@property` fields to the serialized json already being created via the `DataClassJsonMixin` * For all intermediate steps (partitioning, reformatting), the content is saved to a dedicated location on the local filesystem. Right now it's set to `$HOME/.cache/unstructured/ingest/pipeline/STEP_NAME/`. 
* Minor changes: made sense to move some of the config parameters between the read and partition configs when I explicitly divided the responsibility to download vs partition the content in the pipeline. * The pipeline class only makes the doc factory, source and partition nodes required, keeping with the logic that has been supported so far. All reformatting nodes and write node are optional. * Long term, there should also be some changes to the base configs supported by the CLI to support pipeline specific configs, but for now what exists was used to minimize changes in this PR. * Final step to copy the final output to the location designated by the `_output_filename` value of the ingest doc. * Hashing occurs at each step by hashing the parameters of that step (i.e. partition configs) along with the previous step via the filename used. This allows each step to be the same _if_ all the parameters for it have not changed and the content so far is the same. * The only data that is shared and written to across processes is the dictionary of ingest json data. This dict is created using the `multiprocessing.managers.DictProxy` to make sure any interaction with it is behind a lock. ### Minor refactors included: * Utility methods added to extract configs from the click options * Utility method to add common options to click commands. * All writers moved to using the class approach which extracts a lot of the common code so there's less copy-paste when new runners are added. * Use `@property` for source metadata on base ingest doc to add logic to call `update_source_metadata` if it's still `None` at the time it's fetched. ### Additional bug fixes included * Fsspec connectors were not serializable due to the `ingest_doc_cls`. This was removed from the fields captured by the `@dataclass` decorator and added in a `__post_init__` method. * Various reddit connector params were missing. This doesn't have an explicit ingest test at the moment so was never caught. 
* Fsspec connector had the parent `update_source_metadata` misnamed as `update_source_metadata_metadata` so it was never being called. ### Flow Diagram ![ingest_pipeline](https://github.com/Unstructured-IO/unstructured/assets/136338424/be485606-cfe0-4931-8b81-c2bf569cf1e2) --- .gitignore | 2 + CHANGELOG.md | 6 +- Dockerfile | 2 +- docs/requirements.txt | 4 +- requirements/base.txt | 4 +- requirements/build.txt | 4 +- requirements/constraints.in | 1 + requirements/dev.txt | 231 ++++++++++++------ requirements/extra-paddleocr.txt | 10 +- requirements/extra-pdf-image.txt | 22 +- requirements/huggingface.txt | 17 +- requirements/ingest-airtable.txt | 2 +- .../ingest-azure-cognitive-search.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-box.txt | 2 +- requirements/ingest-confluence.txt | 2 +- requirements/ingest-dropbox.txt | 2 +- requirements/ingest-elasticsearch.txt | 2 +- requirements/ingest-gcs.txt | 2 +- requirements/ingest-github.txt | 2 +- requirements/ingest-gitlab.txt | 2 +- requirements/ingest-google-drive.txt | 4 +- requirements/ingest-jira.txt | 2 +- requirements/ingest-onedrive.txt | 2 +- requirements/ingest-openai.txt | 11 +- requirements/ingest-outlook.txt | 2 +- requirements/ingest-reddit.txt | 4 +- requirements/ingest-s3.txt | 2 +- requirements/ingest-salesforce.txt | 4 +- requirements/ingest-sharepoint.txt | 2 +- requirements/ingest-slack.txt | 2 +- requirements/ingest-wikipedia.txt | 2 +- requirements/test.txt | 8 +- .../test-ingest-against-api.sh | 11 +- .../test-ingest-airtable-diff.sh | 5 +- .../test-ingest-airtable-large.sh | 5 +- .../test-ingest-azure-cognitive-search.sh | 13 +- test_unstructured_ingest/test-ingest-azure.sh | 5 +- .../test-ingest-biomed-api.sh | 3 + .../test-ingest-biomed-path.sh | 3 + test_unstructured_ingest/test-ingest-box.sh | 5 +- .../test-ingest-confluence-diff.sh | 5 +- .../test-ingest-confluence-large.sh | 3 + .../test-ingest-delta-table.sh | 3 + .../test-ingest-discord.sh | 3 + 
.../test-ingest-dropbox.sh | 5 +- .../test-ingest-elasticsearch.sh | 5 +- test_unstructured_ingest/test-ingest-gcs.sh | 5 +- .../test-ingest-github.sh | 3 + .../test-ingest-gitlab.sh | 5 +- .../test-ingest-google-drive.sh | 5 +- test_unstructured_ingest/test-ingest-jira.sh | 5 +- ...-ingest-local-single-file-with-encoding.sh | 10 +- ...gle-file-with-pdf-infer-table-structure.sh | 10 +- .../test-ingest-local-single-file.sh | 10 +- test_unstructured_ingest/test-ingest-local.sh | 10 +- .../test-ingest-notion.sh | 5 +- .../test-ingest-onedrive.sh | 3 + .../test-ingest-outlook.sh | 6 +- .../test-ingest-pdf-fast-reprocess.sh | 5 +- .../test-ingest-s3-minio.sh | 5 +- test_unstructured_ingest/test-ingest-s3.sh | 5 +- .../test-ingest-salesforce.sh | 10 +- .../test-ingest-sharepoint-embed-cog-index.sh | 3 + .../test-ingest-sharepoint.sh | 3 + test_unstructured_ingest/test-ingest-slack.sh | 5 +- .../test-ingest-wikipedia.sh | 5 +- test_unstructured_ingest/test-ingest.sh | 2 +- .../unit/doc_processor/test_generalized.py | 52 ---- .../unit/test_interfaces.py | 39 +-- test_unstructured_ingest/unit/test_paths.py | 12 +- unstructured/ingest/cli/cmds/airtable.py | 24 +- unstructured/ingest/cli/cmds/azure.py | 26 +- .../ingest/cli/cmds/azure_cognitive_search.py | 46 +--- unstructured/ingest/cli/cmds/biomed.py | 24 +- unstructured/ingest/cli/cmds/box.py | 26 +- unstructured/ingest/cli/cmds/confluence.py | 29 +-- unstructured/ingest/cli/cmds/delta_table.py | 41 ++-- unstructured/ingest/cli/cmds/discord.py | 29 +-- unstructured/ingest/cli/cmds/dropbox.py | 26 +- unstructured/ingest/cli/cmds/elasticsearch.py | 24 +- unstructured/ingest/cli/cmds/fsspec.py | 23 +- unstructured/ingest/cli/cmds/gcs.py | 26 +- unstructured/ingest/cli/cmds/github.py | 24 +- unstructured/ingest/cli/cmds/gitlab.py | 24 +- unstructured/ingest/cli/cmds/google_drive.py | 25 +- unstructured/ingest/cli/cmds/jira.py | 29 +-- unstructured/ingest/cli/cmds/local.py | 25 +- unstructured/ingest/cli/cmds/notion.py | 30 +-- 
unstructured/ingest/cli/cmds/onedrive.py | 25 +- unstructured/ingest/cli/cmds/outlook.py | 30 +-- unstructured/ingest/cli/cmds/reddit.py | 37 +-- unstructured/ingest/cli/cmds/s3.py | 53 ++-- unstructured/ingest/cli/cmds/salesforce.py | 30 +-- unstructured/ingest/cli/cmds/sharepoint.py | 33 +-- unstructured/ingest/cli/cmds/slack.py | 29 +-- unstructured/ingest/cli/cmds/wikipedia.py | 24 +- unstructured/ingest/cli/interfaces.py | 115 +++++++-- unstructured/ingest/cli/{cmds => }/utils.py | 80 +++--- unstructured/ingest/connector/airtable.py | 4 +- unstructured/ingest/connector/azure.py | 4 +- unstructured/ingest/connector/biomed.py | 10 +- unstructured/ingest/connector/box.py | 4 +- unstructured/ingest/connector/confluence.py | 4 +- unstructured/ingest/connector/delta_table.py | 5 +- unstructured/ingest/connector/discord.py | 7 +- unstructured/ingest/connector/dropbox.py | 8 +- .../ingest/connector/elasticsearch.py | 4 +- unstructured/ingest/connector/fsspec.py | 25 +- unstructured/ingest/connector/gcs.py | 4 +- unstructured/ingest/connector/git.py | 2 +- unstructured/ingest/connector/github.py | 2 +- unstructured/ingest/connector/gitlab.py | 2 +- unstructured/ingest/connector/google_drive.py | 4 +- unstructured/ingest/connector/jira.py | 4 +- unstructured/ingest/connector/local.py | 4 +- .../ingest/connector/notion/connector.py | 13 +- unstructured/ingest/connector/onedrive.py | 4 +- unstructured/ingest/connector/outlook.py | 4 +- unstructured/ingest/connector/reddit.py | 4 +- unstructured/ingest/connector/s3.py | 4 +- unstructured/ingest/connector/salesforce.py | 40 ++- unstructured/ingest/connector/sharepoint.py | 32 +-- unstructured/ingest/connector/slack.py | 7 +- unstructured/ingest/connector/wikipedia.py | 12 +- unstructured/ingest/doc_processor/__init__.py | 0 .../ingest/doc_processor/generalized.py | 70 ------ unstructured/ingest/ingest_doc_json_mixin.py | 60 +++++ unstructured/ingest/interfaces.py | 133 +++++----- unstructured/ingest/pipeline/__init__.py | 
20 ++ unstructured/ingest/pipeline/copy.py | 19 ++ unstructured/ingest/pipeline/doc_factory.py | 15 ++ unstructured/ingest/pipeline/initialize.py | 16 ++ unstructured/ingest/pipeline/interfaces.py | 204 ++++++++++++++++ unstructured/ingest/pipeline/partition.py | 43 ++++ unstructured/ingest/pipeline/pipeline.py | 69 ++++++ .../ingest/pipeline/reformat/chunking.py | 53 ++++ .../ingest/pipeline/reformat/embedding.py | 51 ++++ unstructured/ingest/pipeline/source.py | 27 ++ unstructured/ingest/pipeline/utils.py | 8 + unstructured/ingest/pipeline/write.py | 18 ++ unstructured/ingest/processor.py | 162 +++++------- unstructured/ingest/runner/__init__.py | 152 ++++++------ unstructured/ingest/runner/airtable.py | 94 +++---- unstructured/ingest/runner/azure.py | 110 ++++----- unstructured/ingest/runner/base_runner.py | 24 +- unstructured/ingest/runner/biomed.py | 115 ++++----- unstructured/ingest/runner/box.py | 78 +++--- unstructured/ingest/runner/confluence.py | 108 ++++---- unstructured/ingest/runner/delta_table.py | 104 ++++---- unstructured/ingest/runner/discord.py | 94 +++---- unstructured/ingest/runner/dropbox.py | 84 +++---- unstructured/ingest/runner/elasticsearch.py | 97 +++----- unstructured/ingest/runner/fsspec.py | 97 +++----- unstructured/ingest/runner/gcs.py | 78 +++--- unstructured/ingest/runner/github.py | 101 ++++---- unstructured/ingest/runner/gitlab.py | 101 ++++---- unstructured/ingest/runner/google_drive.py | 97 +++----- unstructured/ingest/runner/jira.py | 111 ++++----- unstructured/ingest/runner/local.py | 75 +++--- unstructured/ingest/runner/notion.py | 117 ++++----- unstructured/ingest/runner/onedrive.py | 109 ++++----- unstructured/ingest/runner/outlook.py | 106 ++++---- unstructured/ingest/runner/reddit.py | 105 ++++---- unstructured/ingest/runner/s3.py | 88 +++---- unstructured/ingest/runner/salesforce.py | 96 +++----- unstructured/ingest/runner/sharepoint.py | 22 +- unstructured/ingest/runner/slack.py | 98 +++----- 
unstructured/ingest/runner/wikipedia.py | 94 +++---- unstructured/ingest/runner/writers.py | 2 - 170 files changed, 2707 insertions(+), 2657 deletions(-) delete mode 100644 test_unstructured_ingest/unit/doc_processor/test_generalized.py rename unstructured/ingest/cli/{cmds => }/utils.py (59%) delete mode 100644 unstructured/ingest/doc_processor/__init__.py delete mode 100644 unstructured/ingest/doc_processor/generalized.py create mode 100644 unstructured/ingest/ingest_doc_json_mixin.py create mode 100644 unstructured/ingest/pipeline/__init__.py create mode 100644 unstructured/ingest/pipeline/copy.py create mode 100644 unstructured/ingest/pipeline/doc_factory.py create mode 100644 unstructured/ingest/pipeline/initialize.py create mode 100644 unstructured/ingest/pipeline/interfaces.py create mode 100644 unstructured/ingest/pipeline/partition.py create mode 100644 unstructured/ingest/pipeline/pipeline.py create mode 100644 unstructured/ingest/pipeline/reformat/chunking.py create mode 100644 unstructured/ingest/pipeline/reformat/embedding.py create mode 100644 unstructured/ingest/pipeline/source.py create mode 100644 unstructured/ingest/pipeline/utils.py create mode 100644 unstructured/ingest/pipeline/write.py diff --git a/.gitignore b/.gitignore index f7efde4599..af2dc5fe8b 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,8 @@ dmypy.json # ingest outputs /structured-output +test_unstructured_ingest/workdir/ +test_unstructured_ingest/delta-table-dest/ # suggested ingest mirror directory /mirror diff --git a/CHANGELOG.md b/CHANGELOG.md index 68938ae3e5..96d106621d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,11 @@ * **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. 
This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles. * **Improve hierarchy detection in pptx documents** List items, and other slide text are properly nested under the slide title. This will enable better chunking of pptx documents. - +* **Refactor of the ingest cli workflow** The refactored approach uses a dynamically set pipeline with a snapshot along each step to save progress and accommodate continuation from a snapshot if an error occurs. This also allows the pipeline to dynamically assign any number of steps to modify the partitioned content before it gets written to a destination. ### Features * **Adds detection_origin field to metadata** Problem: Currently isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order tu use this feature -setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. +setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. ### Fixes @@ -79,7 +79,7 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text * **Fix badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class -should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class +should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class allowing the document to be loaded. Fix: Change parent class for Formula to Text. 
Importance: Crucial to be able to load documents that contain formulas. ## 0.10.16 diff --git a/Dockerfile b/Dockerfile index 0bc9faebbc..3a6dbaeba1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -67,6 +67,6 @@ USER ${NB_USER} COPY example-docs example-docs COPY unstructured unstructured -RUN python3.10 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()" +RUN python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()" CMD ["/bin/bash"] diff --git a/docs/requirements.txt b/docs/requirements.txt index 2373ecfb3e..d2834bd868 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,7 +6,7 @@ # alabaster==0.7.13 # via sphinx -babel==2.12.1 +babel==2.13.0 # via sphinx beautifulsoup4==4.12.2 # via @@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via # -r requirements/build.in # sphinx -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/base.txt b/requirements/base.txt index 3679dd89b2..3e68b38682 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -46,7 +46,7 @@ python-iso639==2023.6.15 # via -r requirements/base.in python-magic==0.4.27 # via -r requirements/base.in -regex==2023.8.8 +regex==2023.10.3 # via nltk requests==2.31.0 # via -r requirements/base.in @@ -62,7 +62,7 @@ typing-extensions==4.8.0 # via typing-inspect typing-inspect==0.9.0 # via dataclasses-json -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/constraints.in # requests diff --git a/requirements/build.txt b/requirements/build.txt index 2373ecfb3e..d2834bd868 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -6,7 +6,7 @@ # alabaster==0.7.13 # via sphinx -babel==2.12.1 +babel==2.13.0 # via sphinx beautifulsoup4==4.12.2 # via @@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via # -r requirements/build.in # sphinx -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c 
requirements/constraints.in diff --git a/requirements/constraints.in b/requirements/constraints.in index 19a6775177..b51c00f3d7 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -44,3 +44,4 @@ anyio<4.0 # pinned in unstructured paddleocr opencv-python==4.8.0.76 opencv-contrib-python==4.8.0.76 +onnxruntime==1.15.1 diff --git a/requirements/dev.txt b/requirements/dev.txt index e83859d7dd..5950bd283e 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -13,18 +13,21 @@ appnope==0.1.3 # ipykernel # ipython argon2-cffi==23.1.0 - # via - # jupyter-server - # nbclassic - # notebook + # via jupyter-server argon2-cffi-bindings==21.2.0 # via argon2-cffi +arrow==1.3.0 + # via isoduration asttokens==2.4.0 # via stack-data +async-lru==2.0.4 + # via jupyterlab attrs==23.1.0 # via # jsonschema # referencing +babel==2.13.0 + # via jupyterlab-server backcall==0.2.0 # via ipython beautifulsoup4==4.12.2 @@ -33,10 +36,23 @@ beautifulsoup4==4.12.2 # nbconvert bleach==6.0.0 # via nbconvert +build==1.0.3 + # via pip-tools +certifi==2023.7.22 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # -c requirements/test.txt + # requests cffi==1.16.0 # via argon2-cffi-bindings cfgv==3.4.0 # via pre-commit +charset-normalizer==3.3.0 + # via + # -c requirements/base.txt + # -c requirements/test.txt + # requests click==8.1.7 # via # -c requirements/base.txt @@ -54,10 +70,6 @@ defusedxml==0.7.1 # via nbconvert distlib==0.3.7 # via virtualenv -entrypoints==0.4 - # via - # jupyter-client - # nbconvert exceptiongroup==1.1.3 # via # -c requirements/test.txt @@ -68,6 +80,8 @@ fastjsonschema==2.18.1 # via nbformat filelock==3.12.4 # via virtualenv +fqdn==1.5.1 + # via jsonschema identify==2.5.30 # via pre-commit idna==3.4 @@ -75,17 +89,26 @@ idna==3.4 # -c requirements/base.txt # -c requirements/test.txt # anyio + # jsonschema + # requests +importlib-metadata==6.8.0 + # via + # build + # jupyter-client + # jupyter-lsp + # jupyterlab + # 
jupyterlab-server + # nbconvert importlib-resources==6.1.0 # via # jsonschema # jsonschema-specifications - # notebook -ipykernel==6.11.0 + # jupyterlab +ipykernel==6.25.2 # via # jupyter # jupyter-console - # nbclassic - # notebook + # jupyterlab # qtconsole ipython==8.12.3 # via @@ -95,55 +118,74 @@ ipython==8.12.3 # ipywidgets # jupyter-console ipython-genutils==0.2.0 - # via - # jupyter-server - # nbclassic - # notebook - # qtconsole + # via qtconsole ipywidgets==8.1.1 # via jupyter -jedi==0.19.0 +isoduration==20.11.0 + # via jsonschema +jedi==0.19.1 # via ipython jinja2==3.1.2 # via # jupyter-server - # nbclassic + # jupyterlab + # jupyterlab-server # nbconvert - # notebook -jsonschema==4.19.1 - # via nbformat +json5==0.9.14 + # via jupyterlab-server +jsonpointer==2.4 + # via jsonschema +jsonschema[format-nongpl]==4.19.1 + # via + # jupyter-events + # jupyterlab-server + # nbformat jsonschema-specifications==2023.7.1 # via jsonschema jupyter==1.0.0 # via -r requirements/dev.in -jupyter-client==7.4.9 +jupyter-client==8.3.1 # via # ipykernel # jupyter-console # jupyter-server - # nbclassic # nbclient - # notebook # qtconsole -jupyter-console==6.4.4 +jupyter-console==6.6.3 # via jupyter jupyter-core==5.3.2 # via # -c requirements/constraints.in # ipykernel # jupyter-client + # jupyter-console # jupyter-server - # nbclassic + # jupyterlab + # nbclient # nbconvert # nbformat - # notebook # qtconsole -jupyter-server==1.13.1 +jupyter-events==0.7.0 + # via jupyter-server +jupyter-lsp==2.2.0 + # via jupyterlab +jupyter-server==2.7.3 # via - # nbclassic + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # notebook # notebook-shim +jupyter-server-terminals==0.4.4 + # via jupyter-server +jupyterlab==4.0.6 + # via notebook jupyterlab-pygments==0.2.2 # via nbconvert +jupyterlab-server==2.25.0 + # via + # jupyterlab + # notebook jupyterlab-widgets==3.0.9 # via ipywidgets markupsafe==2.1.3 @@ -154,57 +196,56 @@ matplotlib-inline==0.1.6 # via # ipykernel # ipython 
-mistune==0.8.4 +mistune==3.0.2 # via nbconvert -nbclassic==1.0.0 - # via notebook -nbclient==0.5.13 +nbclient==0.8.0 # via nbconvert -nbconvert==6.4.5 +nbconvert==7.9.2 # via # jupyter # jupyter-server - # nbclassic - # notebook nbformat==5.9.2 # via # jupyter-server - # nbclassic # nbclient # nbconvert - # notebook nest-asyncio==1.5.8 - # via - # ipykernel - # jupyter-client - # nbclassic - # nbclient - # notebook + # via ipykernel nodeenv==1.8.0 # via pre-commit -notebook==6.5.6 +notebook==7.0.4 # via jupyter notebook-shim==0.2.3 # via - # nbclassic + # jupyterlab # notebook +overrides==7.4.0 + # via jupyter-server +packaging==23.2 + # via + # -c requirements/base.txt + # -c requirements/test.txt + # build + # ipykernel + # jupyter-server + # jupyterlab + # jupyterlab-server + # nbconvert + # qtconsole + # qtpy pandocfilters==1.5.0 # via nbconvert parso==0.8.3 # via jedi -pep517==0.13.0 - # via - # build - # pip-tools pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.6.2 +pip-tools==7.3.0 # via -r requirements/dev.in pkgutil-resolve-name==1.3.10 # via jsonschema -platformdirs==3.10.0 +platformdirs==3.11.0 # via # -c requirements/test.txt # jupyter-core @@ -212,10 +253,7 @@ platformdirs==3.10.0 pre-commit==3.4.0 # via -r requirements/dev.in prometheus-client==0.17.1 - # via - # jupyter-server - # nbclassic - # notebook + # via jupyter-server prompt-toolkit==3.0.39 # via # ipython @@ -236,38 +274,57 @@ pygments==2.16.1 # jupyter-console # nbconvert # qtconsole +pyproject-hooks==1.0.0 + # via build python-dateutil==2.8.2 # via # -c requirements/test.txt + # arrow # jupyter-client +python-json-logger==2.0.7 + # via jupyter-events +pytz==2023.3.post1 + # via babel pyyaml==6.0.1 # via # -c requirements/test.txt + # jupyter-events # pre-commit -pyzmq==24.0.1 +pyzmq==25.1.1 # via + # ipykernel # jupyter-client + # jupyter-console # jupyter-server - # nbclassic - # notebook # qtconsole -qtconsole==5.2.2 +qtconsole==5.4.4 # via jupyter 
-qtpy==1.11.3 +qtpy==2.4.0 # via qtconsole referencing==0.30.2 # via # jsonschema # jsonschema-specifications -rpds-py==0.10.3 + # jupyter-events +requests==2.31.0 + # via + # -c requirements/base.txt + # -c requirements/test.txt + # jupyterlab-server +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rpds-py==0.10.4 # via # jsonschema # referencing send2trash==1.8.2 - # via - # jupyter-server - # nbclassic - # notebook + # via jupyter-server six==1.16.0 # via # -c requirements/base.txt @@ -275,6 +332,7 @@ six==1.16.0 # asttokens # bleach # python-dateutil + # rfc3339-validator sniffio==1.3.0 # via anyio soupsieve==2.5 @@ -286,50 +344,67 @@ stack-data==0.6.3 terminado==0.17.1 # via # jupyter-server - # nbclassic - # notebook -testpath==0.6.0 + # jupyter-server-terminals +tinycss2==1.2.1 # via nbconvert tomli==2.0.1 # via # -c requirements/test.txt - # pep517 + # build + # jupyterlab # pip-tools + # pyproject-hooks tornado==6.3.3 # via # ipykernel # jupyter-client # jupyter-server - # nbclassic + # jupyterlab # notebook # terminado -traitlets==5.10.1 +traitlets==5.11.2 # via # comm # ipykernel # ipython # ipywidgets # jupyter-client + # jupyter-console # jupyter-core + # jupyter-events # jupyter-server + # jupyterlab # matplotlib-inline - # nbclassic # nbclient # nbconvert # nbformat - # notebook # qtconsole +types-python-dateutil==2.8.19.14 + # via arrow typing-extensions==4.8.0 # via # -c requirements/base.txt # -c requirements/test.txt + # async-lru # ipython +uri-template==1.3.0 + # via jsonschema +urllib3==1.26.17 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # -c requirements/test.txt + # requests virtualenv==20.24.5 # via pre-commit wcwidth==0.2.8 # via prompt-toolkit +webcolors==1.13 + # via jsonschema webencodings==0.5.1 - # via bleach + # via + # bleach + # tinycss2 websocket-client==1.6.3 # via jupyter-server wheel==0.41.2 @@ -339,7 +414,9 @@ 
wheel==0.41.2 widgetsnbextension==4.0.9 # via ipywidgets zipp==3.17.0 - # via importlib-resources + # via + # importlib-metadata + # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 7a3fc605d0..8ed43d2387 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -6,7 +6,7 @@ # attrdict==2.0.1 # via unstructured-paddleocr -babel==2.12.1 +babel==2.13.0 # via flask-babel bce-python-sdk==0.8.90 # via visualdl @@ -35,7 +35,7 @@ cssutils==2.7.1 # via premailer cycler==0.12.0 # via matplotlib -cython==3.0.2 +cython==3.0.3 # via unstructured-paddleocr et-xmlfile==1.1.0 # via openpyxl @@ -43,7 +43,7 @@ flask==3.0.0 # via # flask-babel # visualdl -flask-babel==3.1.0 +flask-babel==4.0.0 # via visualdl fonttools==4.43.0 # via matplotlib @@ -53,7 +53,7 @@ idna==3.4 # via # -c requirements/base.txt # requests -imageio==2.31.4 +imageio==2.31.5 # via # imgaug # scikit-image @@ -211,7 +211,7 @@ tzdata==2023.3 # via pandas unstructured-paddleocr==2.6.1.3 # via -r requirements/extra-paddleocr.in -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 140eaa0a24..297b30255e 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -41,9 +41,11 @@ fsspec==2023.9.1 # via # -c requirements/constraints.in # huggingface-hub -huggingface-hub==0.17.3 + # torch +huggingface-hub==0.16.4 # via # timm + # tokenizers # transformers # unstructured-inference humanfriendly==10.0 @@ -91,8 +93,10 @@ omegaconf==2.3.0 # via effdet onnx==1.14.1 # via unstructured-inference -onnxruntime==1.16.0 - # via unstructured-inference +onnxruntime==1.15.1 + # via + # -c requirements/constraints.in + # unstructured-inference opencv-python==4.8.0.76 # via # -c requirements/constraints.in 
@@ -166,7 +170,7 @@ pyyaml==6.0.1 # transformers rapidfuzz==3.3.1 # via unstructured-inference -regex==2023.8.8 +regex==2023.10.3 # via # -c requirements/base.txt # transformers @@ -195,15 +199,15 @@ sympy==1.12 # torch timm==0.9.7 # via effdet -tokenizers==0.13.3 +tokenizers==0.14.0 # via transformers -torch==2.0.1 +torch==2.1.0 # via # effdet # layoutparser # timm # torchvision -torchvision==0.15.2 +torchvision==0.16.0 # via # effdet # layoutparser @@ -214,7 +218,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.33.3 +transformers==4.34.0 # via unstructured-inference typing-extensions==4.8.0 # via @@ -231,7 +235,7 @@ unstructured-pytesseract==0.3.12 # via # -c requirements/constraints.in # -r requirements/extra-pdf-image.in -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 2fc6f0efb9..bdb0510555 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -26,8 +26,11 @@ fsspec==2023.9.1 # via # -c requirements/constraints.in # huggingface-hub -huggingface-hub==0.17.3 - # via transformers + # torch +huggingface-hub==0.16.4 + # via + # tokenizers + # transformers idna==3.4 # via # -c requirements/base.txt @@ -62,7 +65,7 @@ pyyaml==6.0.1 # via # huggingface-hub # transformers -regex==2023.8.8 +regex==2023.10.3 # via # -c requirements/base.txt # sacremoses @@ -87,9 +90,9 @@ six==1.16.0 # sacremoses sympy==1.12 # via torch -tokenizers==0.13.3 +tokenizers==0.14.0 # via transformers -torch==2.0.1 +torch==2.1.0 # via -r requirements/huggingface.in tqdm==4.66.1 # via @@ -97,14 +100,14 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.33.3 +transformers==4.34.0 # via -r requirements/huggingface.in typing-extensions==4.8.0 # via # -c requirements/base.txt # huggingface-hub # torch -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c 
requirements/constraints.in diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index 26744992b7..52467ffc78 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -34,7 +34,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # pyairtable # pydantic -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt index 763e3a14c1..ced625edab 100644 --- a/requirements/ingest-azure-cognitive-search.txt +++ b/requirements/ingest-azure-cognitive-search.txt @@ -50,7 +50,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # azure-core # azure-search-documents -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 55370e769f..42a635af49 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -94,7 +94,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # azure-core # azure-storage-blob -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index 5d61bfc721..ee21de809d 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -49,7 +49,7 @@ six==1.16.0 # via # -c requirements/base.txt # python-dateutil -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index 0a36bb3cfa..64218b1892 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -36,7 +36,7 @@ six==1.16.0 # via # -c requirements/base.txt # atlassian-python-api -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c 
requirements/constraints.in diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index 56c77ff37e..e3ad8ce8a0 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -40,7 +40,7 @@ six==1.16.0 # stone stone==3.3.1 # via dropbox -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-elasticsearch.txt b/requirements/ingest-elasticsearch.txt index 356f8e3e2b..b4b7201ea4 100644 --- a/requirements/ingest-elasticsearch.txt +++ b/requirements/ingest-elasticsearch.txt @@ -15,7 +15,7 @@ elasticsearch==8.10.0 # via -r requirements/ingest-elasticsearch.in jq==1.6.0 # via -r requirements/ingest-elasticsearch.in -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 463dcbddff..d36bec2b46 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -103,7 +103,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 1c649274c5..e0eae5b2a9 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -47,7 +47,7 @@ typing-extensions==4.8.0 # via # -c requirements/base.txt # pygithub -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index 4d45eeda5c..02c9f868c9 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -26,7 +26,7 @@ requests==2.31.0 # requests-toolbelt requests-toolbelt==1.0.0 # via python-gitlab -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git 
a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index ace1ff7fa2..06b4fbbdcd 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.0 # requests google-api-core==2.12.0 # via google-api-python-client -google-api-python-client==2.101.0 +google-api-python-client==2.102.0 # via -r requirements/ingest-google-drive.in google-auth==2.23.2 # via @@ -59,7 +59,7 @@ rsa==4.9 # via google-auth uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt index e53d3dc493..3681c6b68f 100644 --- a/requirements/ingest-jira.txt +++ b/requirements/ingest-jira.txt @@ -36,7 +36,7 @@ six==1.16.0 # via # -c requirements/base.txt # atlassian-python-api -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index 99e838c70b..7dd0eaa8f9 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -52,7 +52,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index d0562f7058..64f81006a4 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -50,9 +50,9 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.305 +langchain==0.0.309 # via -r requirements/ingest-openai.in -langsmith==0.0.41 +langsmith==0.0.42 # via langchain marshmallow==3.20.1 # via @@ -66,14 +66,11 @@ mypy-extensions==1.0.0 # via # -c requirements/base.txt # typing-inspect -numexpr==2.8.6 - # via langchain numpy==1.24.4 # via # -c requirements/base.txt # 
-c requirements/constraints.in # langchain - # numexpr openai==0.28.1 # via -r requirements/ingest-openai.in packaging==23.2 @@ -87,7 +84,7 @@ pydantic==1.10.13 # langsmith pyyaml==6.0.1 # via langchain -regex==2023.8.8 +regex==2023.10.3 # via # -c requirements/base.txt # tiktoken @@ -120,7 +117,7 @@ typing-inspect==0.9.0 # via # -c requirements/base.txt # dataclasses-json -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index 9ca3f43a72..225004c7bb 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -42,7 +42,7 @@ requests==2.31.0 # -c requirements/base.txt # msal # office365-rest-python-client -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 7e19fdb9f4..1b2f69540d 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -19,7 +19,7 @@ idna==3.4 # requests praw==7.7.1 # via -r requirements/ingest-reddit.in -prawcore==2.3.0 +prawcore==2.4.0 # via praw requests==2.31.0 # via @@ -28,7 +28,7 @@ requests==2.31.0 # update-checker update-checker==0.18.0 # via praw -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index b86dfe415b..2ccb068f3b 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -55,7 +55,7 @@ typing-extensions==4.8.0 # via # -c requirements/base.txt # aioitertools -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index 817921dd71..92fc077a3b 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -33,7 +33,7 @@ 
more-itertools==10.1.0 # via simple-salesforce pendulum==2.1.2 # via simple-salesforce -platformdirs==3.10.0 +platformdirs==3.11.0 # via zeep pycparser==2.21 # via cffi @@ -64,7 +64,7 @@ six==1.16.0 # isodate # python-dateutil # requests-file -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 97cae3dd91..1c2c7f5f63 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -42,7 +42,7 @@ requests==2.31.0 # -c requirements/base.txt # msal # office365-rest-python-client -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index 5af003f16e..43cb8f756c 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -4,5 +4,5 @@ # # pip-compile requirements/ingest-slack.in # -slack-sdk==3.22.0 +slack-sdk==3.23.0 # via -r requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index ec1add403a..bfefd071b6 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -29,7 +29,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/test.txt b/requirements/test.txt index b4e48463db..7e94b99449 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,7 +22,7 @@ click==8.1.7 # -c requirements/base.txt # -r requirements/test.in # black -coverage[toml]==7.3.1 +coverage[toml]==7.3.2 # via # -r requirements/test.in # pytest-cov @@ -68,7 +68,7 @@ packaging==23.2 # pytest pathspec==0.11.2 # via black -platformdirs==3.10.0 +platformdirs==3.11.0 # via black pluggy==1.3.0 # via pytest @@ -97,7 +97,7 @@ requests==2.31.0 # via # -c 
requirements/base.txt # label-studio-sdk -ruff==0.0.291 +ruff==0.0.292 # via -r requirements/test.in six==1.16.0 # via @@ -125,7 +125,7 @@ typing-extensions==4.8.0 # black # mypy # pydantic -urllib3==1.26.16 +urllib3==1.26.17 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index 3f7a43d807..d6d93f835c 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -10,11 +10,17 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=api-ingest-output OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +# shellcheck disable=SC1091 # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ local \ @@ -27,6 +33,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --num-processes "$max_processes" \ --file-glob "*1p.txt" \ - --input-path example-docs + --input-path example-docs \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index 8c69a31146..c7565a58f3 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -9,6 +9,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=airtable-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME CI=${CI:-"false"} max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -17,6 +18,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -40,6 +42,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" + --output-dir "$OUTPUT_DIR" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-airtable-large.sh b/test_unstructured_ingest/test-ingest-airtable-large.sh index b87e728187..89cae31c67 100755 --- a/test_unstructured_ingest/test-ingest-airtable-large.sh +++ b/test_unstructured_ingest/test-ingest-airtable-large.sh @@ -10,6 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=airtable-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -18,6 +19,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -43,7 +45,8 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" + --output-dir "$OUTPUT_DIR" \ + --work-dir "$WORK_DIR" # We are expecting fifteen directories: fourteen bases and the parent directory diff --git a/test_unstructured_ingest/test-ingest-azure-cognitive-search.sh b/test_unstructured_ingest/test-ingest-azure-cognitive-search.sh index 18f638e375..5669396c9b 100755 --- a/test_unstructured_ingest/test-ingest-azure-cognitive-search.sh +++ b/test_unstructured_ingest/test-ingest-azure-cognitive-search.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)" # The vector configs on the schema currently only exist on versions: @@ -16,8 +17,10 @@ if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then echo "Skipping Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." exit 0 fi - +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh function cleanup { + # Index cleanup response_code=$(curl -s -o /dev/null -w "%{http_code}" \ "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ --header "api-key: $AZURE_SEARCH_API_KEY" \ @@ -31,6 +34,13 @@ function cleanup { else echo "Index $DESTINATION_INDEX does not exist, nothing to delete" fi + + # Local file cleanup + cleanup_dir "$WORK_DIR" + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT @@ -62,6 +72,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ --anonymous \ + --work-dir "$WORK_DIR" \ azure-cognitive-search \ --key "$AZURE_SEARCH_API_KEY" \ --endpoint "$AZURE_SEARCH_ENDPOINT" \ diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index 9fdb9dd5e5..378f021f63 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=azure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -13,6 +14,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" } trap cleanup EXIT @@ -27,6 +29,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --verbose \ --account-name azureunstructured1 \ - --remote-url abfs://container1/ + --remote-url abfs://container1/ \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index bf0de6998f..e12d6fe525 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -7,6 +7,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=biomed-api OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -14,6 +15,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" } trap cleanup EXIT @@ -35,5 +37,6 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --decay .3 \ --max-request-time 30 \ --max-retries 5 \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index b726364ef3..a7c8c7b128 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -7,6 +7,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=biomed-path OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -14,6 +15,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" } trap cleanup EXIT @@ -33,5 +35,6 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --max-request-time 30 \ --max-retries 5 \ --path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-box.sh b/test_unstructured_ingest/test-ingest-box.sh index 43a8ad38ff..5f89710857 100755 --- a/test_unstructured_ingest/test-ingest-box.sh +++ b/test_unstructured_ingest/test-ingest-box.sh @@ -9,6 +9,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=box OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -17,6 +18,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -45,6 +47,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --preserve-downloads \ --recursive \ --reprocess \ - --verbose + --verbose \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index c9c0c21483..35619421d6 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -8,6 +8,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=confluence-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -16,6 +17,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -39,6 +41,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --url https://unstructured-ingest-test.atlassian.net \ --user-email "$CONFLUENCE_USER_EMAIL" \ --api-token "$CONFLUENCE_API_TOKEN" \ - --spaces testteamsp,MFS + --spaces testteamsp,MFS \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-confluence-large.sh b/test_unstructured_ingest/test-ingest-confluence-large.sh index c1196bdd3d..6f95ee90e4 100755 --- a/test_unstructured_ingest/test-ingest-confluence-large.sh +++ b/test_unstructured_ingest/test-ingest-confluence-large.sh @@ -10,6 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=confluence-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -18,6 +19,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -48,6 +50,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --max-num-of-spaces 10 \ --spaces testteamsp1 \ --max-num-of-docs-from-each-space 250 \ + --work-dir "$WORK_DIR" OUTPUT_SUBFOLDER_NAME=testteamsp1 diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index d4c79a8f0d..c4be1a25f6 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=delta-table OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -22,6 +23,7 @@ source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -39,6 +41,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \ --preserve-downloads \ --verbose \ + --work-dir "$WORK_DIR" \ delta-table \ --write-column json_data \ --table-uri "$DESTINATION_TABLE" diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index 7aedb2b352..b32b7df359 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=discord OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -36,5 +38,6 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --channels 1099442333440802930,1099601456321003600 \ --token "$DISCORD_TOKEN" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index b591f0cdd8..c6052e8ea3 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=dropbox OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -41,7 +43,8 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --verbose \ --token "$DROPBOX_ACCESS_TOKEN" \ --recursive \ - --remote-url "dropbox:// /" + --remote-url "dropbox:// /" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index 7b181f90ba..a983c2781f 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -7,6 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 echo "SCRIPT_DIR: $SCRIPT_DIR" OUTPUT_FOLDER_NAME=elasticsearch OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -20,6 +21,7 @@ function cleanup() { docker-compose -f scripts/elasticsearch-test-helpers/docker-compose.yaml down --remove-orphans -v cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -42,6 +44,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --index-name movies \ --url http://localhost:9200 \ - --jq-query '{ethnicity, director, plot}' + --jq-query '{ethnicity, director, plot}' \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index dd43710941..95ba89e440 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=gcs OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -40,7 +42,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --token "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ - --remote-url gs://utic-test-ingest-fixtures/ + --remote-url gs://utic-test-ingest-fixtures/ \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index 4061bea956..31d11995ef 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=github OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -46,6 +48,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --verbose \ --url dcneiner/Downloadify \ --git-file-glob '*.html,*.txt' \ + --work-dir "$WORK_DIR" \ $ACCESS_TOKEN_FLAGS "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index 1a9031c7a7..9f5003f682 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=gitlab OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -32,6 +34,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --git-branch 'v0.0.7' \ --git-file-glob '*.md,*.txt' \ - --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab + --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index 218a5cfe0a..18560a1719 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=google-drive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -41,7 +43,8 @@ PYTHONPATH=. unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --verbose \ --drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \ - --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" + --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index 173fc4f94b..f3646d9af3 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -7,6 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=jira-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -15,6 +16,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -60,7 +62,8 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --api-token "$JIRA_INGEST_API_TOKEN" \ --projects "JCTP3" \ --boards "1" \ - --issues "JCTP2-4,JCTP2-7,JCTP2-8,10012,JCTP2-11" + --issues "JCTP2-4,JCTP2-7,JCTP2-8,10012,JCTP2-11" \ + --work-dir "$WORK_DIR" diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh index 6442eec0b3..2735e10ed6 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh @@ -6,11 +6,16 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-encoding OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ local \ @@ -20,7 +25,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --encoding cp1252 \ --verbose \ --reprocess \ - --input-path example-docs/fake-html-cp1252.html + --input-path example-docs/fake-html-cp1252.html \ + --work-dir "$WORK_DIR" set +e diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh index 9d15a0e55c..825319c4ad 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh @@ -6,11 +6,16 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ local \ @@ -21,7 +26,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy hi_res \ --verbose \ --reprocess \ - --input-path example-docs/layout-parser-paper.pdf + --input-path example-docs/layout-parser-paper.pdf \ + --work-dir "$WORK_DIR" set +e diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh index 24954c1821..bf6ad5f416 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file.sh @@ -6,11 +6,16 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ local \ @@ -21,7 +26,8 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --strategy ocr_only \ --verbose \ --reprocess \ - --input-path example-docs/english-and-korean.png + --input-path example-docs/english-and-korean.png \ + --work-dir "$WORK_DIR" set +e diff --git a/test_unstructured_ingest/test-ingest-local.sh b/test_unstructured_ingest/test-ingest-local.sh index 0e1b3856c1..1754ec12b3 100755 --- a/test_unstructured_ingest/test-ingest-local.sh +++ b/test_unstructured_ingest/test-ingest-local.sh @@ -6,11 +6,16 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ local \ @@ -21,6 +26,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --verbose \ --file-glob "*.html" \ - --input-path example-docs + --input-path example-docs \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index 2a83a47bb3..c87430320b 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=notion OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -34,7 +36,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ --num-processes "$max_processes" \ --recursive \ - --verbose + --verbose \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 290643815d..0cad6257fc 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=onedrive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -41,5 +43,6 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --user-pname "$MS_USER_PNAME" \ --path '/utic-test-ingest-fixtures' \ --recursive \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index 384287e7ea..82b76bdcc5 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=outlook OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -39,8 +41,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --tenant "$MS_TENANT_ID" \ --user-email "$MS_USER_EMAIL" \ --outlook-folders IntegrationTest \ - --recursive - + --recursive \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index 96acee7bd3..470e15af9e 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -7,6 +7,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=pdf-fast-reprocess OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME INPUT_PATH=$SCRIPT_DIR/download max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -15,6 +16,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$INPUT_PATH" fi @@ -34,7 +36,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose \ --file-glob "*.pdf" \ --input-path "$INPUT_PATH" \ - --recursive + --recursive \ + --work-dir "$WORK_DIR" diff --git a/test_unstructured_ingest/test-ingest-s3-minio.sh b/test_unstructured_ingest/test-ingest-s3-minio.sh index 000c28e28b..0164cb9dde 100755 --- a/test_unstructured_ingest/test-ingest-s3-minio.sh +++ b/test_unstructured_ingest/test-ingest-s3-minio.sh @@ -7,6 +7,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=s3-minio OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} secret_key=minioadmin @@ -21,6 +22,7 @@ function cleanup() { docker-compose -f scripts/minio-test-helpers/docker-compose.yaml down --remove-orphans -v cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" } trap cleanup EXIT @@ -40,7 +42,8 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. 
./u --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/ \ - --endpoint-url http://localhost:9000 + --endpoint-url http://localhost:9000 \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index 214a70ab71..e150e366dc 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -7,6 +7,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=s3 OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -14,6 +15,7 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" } trap cleanup EXIT @@ -30,7 +32,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ - --anonymous + --anonymous \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index 04f686e1d9..bfaec6e647 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -9,6 +9,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=salesforce OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -17,12 +18,18 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi } trap cleanup EXIT +if [ -z "$SALESFORCE_USERNAME" ] || [ -z "$SALESFORCE_CONSUMER_KEY" ]; then + echo "Skipping Salesforce ingest test because SALESFORCE_USERNAME and SALESFORCE_CONSUMER_KEY env vars not set" + exit 0 +fi + if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." exit 0 @@ -47,6 +54,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --recursive \ --reprocess \ --output-dir "$OUTPUT_DIR" \ - --verbose + --verbose \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh index 738848e008..aa3f8b91fe 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=sharepoint-azure-dest OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)" # The vector configs on the schema currently only exist on versions: @@ -47,6 +48,7 @@ function cleanup { fi cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -88,6 +90,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --embedding-api-key "$OPENAI_API_KEY" \ --chunk-elements \ --chunk-multipage-sections \ + --work-dir "$WORK_DIR" \ azure-cognitive-search \ --key "$AZURE_SEARCH_API_KEY" \ --endpoint "$AZURE_SEARCH_ENDPOINT" \ diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 8eefa87a60..8aea377c41 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=Sharepoint OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -40,5 +42,6 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --site "$SHAREPOINT_SITE" \ --path "Shared Documents" \ --recursive \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index ff51d63692..ecc96994d8 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=slack OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -38,6 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --channels C052BGT7718 \ --token "${SLACK_TOKEN}" \ --start-date 2023-04-01 \ - --end-date 2023-04-08T12:00:00-08:00 + --end-date 2023-04-08T12:00:00-08:00 \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index 1dc5e428b4..0f81060e88 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -6,6 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=wikipedia OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -14,6 +15,7 @@ CI=${CI:-"false"} source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi @@ -29,6 +31,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --preserve-downloads \ --output-dir "$OUTPUT_DIR" \ --verbose \ - --page-title "Open Source Software" + --page-title "Open Source Software" \ + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 8c2dffc977..9ac6b38f2e 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -16,6 +16,7 @@ scripts=( 'test-ingest-biomed-path.sh' ## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option 'test-ingest-pdf-fast-reprocess.sh' +'test-ingest-salesforce.sh' 'test-ingest-box.sh' 'test-ingest-discord.sh' 'test-ingest-dropbox.sh' @@ -40,7 +41,6 @@ scripts=( 'test-ingest-local-single-file-with-pdf-infer-table-structure.sh' 'test-ingest-notion.sh' 'test-ingest-delta-table.sh' -'test-ingest-salesforce.sh' 'test-ingest-jira.sh' 'test-ingest-sharepoint.sh' ) diff --git a/test_unstructured_ingest/unit/doc_processor/test_generalized.py b/test_unstructured_ingest/unit/doc_processor/test_generalized.py deleted file mode 100644 index 41c3fc5070..0000000000 --- a/test_unstructured_ingest/unit/doc_processor/test_generalized.py +++ /dev/null @@ -1,52 +0,0 @@ -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseIngestDoc, IngestDocSessionHandleMixin - - -@dataclass -class 
IngestDocWithSessionHandle(IngestDocSessionHandleMixin, BaseIngestDoc): - pass - - -def test_process_document_with_session_handle(mocker): - """Test that the process_document function calls the doc_processor_fn with the correct - arguments, assigns the session handle, and returns the correct results.""" - mock_doc = mocker.MagicMock(spec=(IngestDocWithSessionHandle)) - mocker.patch( - "unstructured.ingest.connector.registry.create_ingest_doc_from_json", - return_value=mock_doc, - ) - mock_session_handle = mocker.MagicMock() - mocker.patch( - "unstructured.ingest.doc_processor.generalized.session_handle", - mock_session_handle, - ) - - # import here to account for the patching above - from unstructured.ingest.doc_processor.generalized import process_document - - result = process_document(mocker.MagicMock()) - - mock_doc.get_file.assert_called_once_with() - mock_doc.write_result.assert_called_with() - mock_doc.cleanup_file.assert_called_once_with() - assert result == mock_doc.process_file.return_value - assert mock_doc.session_handle == mock_session_handle - - -def test_process_document_no_session_handle(mocker): - """Test that the process_document function calls does not assign session handle the IngestDoc - does not have the session handle mixin.""" - mock_doc = mocker.MagicMock(spec=(BaseIngestDoc)) - mocker.patch( - "unstructured.ingest.connector.registry.create_ingest_doc_from_json", - return_value=mock_doc, - ) - mocker.patch("unstructured.ingest.doc_processor.generalized.session_handle", mocker.MagicMock()) - - # import here to account for the patching above - from unstructured.ingest.doc_processor.generalized import process_document - - process_document(mock_doc) - - assert not hasattr(mock_doc, "session_handle") diff --git a/test_unstructured_ingest/unit/test_interfaces.py b/test_unstructured_ingest/unit/test_interfaces.py index 6a7d3c162d..8d8c0cbfbd 100644 --- a/test_unstructured_ingest/unit/test_interfaces.py +++ 
b/test_unstructured_ingest/unit/test_interfaces.py @@ -10,6 +10,7 @@ BaseConnectorConfig, BaseIngestDoc, PartitionConfig, + ProcessorConfig, ReadConfig, ) from unstructured.partition.auto import partition @@ -116,10 +117,11 @@ def test_partition_file(): test_ingest_doc = TestIngestDoc( connector_config=TEST_CONFIG, read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - partition_config=PartitionConfig(output_dir=TEST_OUTPUT_DIR), + processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), ) test_ingest_doc._date_processed = TEST_DATE_PROCESSSED - isd_elems = test_ingest_doc.partition_file() + isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig()) + isd_elems = convert_to_dict(isd_elems_raw) assert len(isd_elems) expected_keys = { "element_id", @@ -162,9 +164,10 @@ def test_process_file_fields_include_default(mocker, partition_test_results): test_ingest_doc = TestIngestDoc( connector_config=TEST_CONFIG, read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - partition_config=PartitionConfig(output_dir=TEST_OUTPUT_DIR), + processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), ) - isd_elems = test_ingest_doc.process_file() + isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig()) + isd_elems = convert_to_dict(isd_elems_raw) assert len(isd_elems) assert mock_partition.call_count == 1 for elem in isd_elems: @@ -191,15 +194,15 @@ def test_process_file_metadata_includes_filename_and_filetype( "unstructured.ingest.interfaces.partition", return_value=partition_test_results, ) + partition_config = PartitionConfig( + metadata_include=["filename", "filetype"], + ) test_ingest_doc = TestIngestDoc( connector_config=TEST_CONFIG, read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - partition_config=PartitionConfig( - output_dir=TEST_OUTPUT_DIR, - metadata_include=["filename", "filetype"], - ), + processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), ) - isd_elems = test_ingest_doc.process_file() + 
isd_elems = test_ingest_doc.process_file(partition_config=partition_config) assert len(isd_elems) for elem in isd_elems: # Parent IDs are non-deterministic - remove them from the test @@ -215,15 +218,17 @@ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_r "unstructured.ingest.interfaces.partition", return_value=partition_test_results, ) + partition_config = PartitionConfig( + metadata_exclude=["filename", "page_number"], + ) test_ingest_doc = TestIngestDoc( connector_config=TEST_CONFIG, read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - partition_config=PartitionConfig( + processor_config=ProcessorConfig( output_dir=TEST_OUTPUT_DIR, - metadata_exclude=["filename", "page_number"], ), ) - isd_elems = test_ingest_doc.process_file() + isd_elems = test_ingest_doc.process_file(partition_config=partition_config) assert len(isd_elems) for elem in isd_elems: assert "filename" not in elem["metadata"] @@ -235,16 +240,18 @@ def test_process_file_flatten_metadata(mocker, partition_test_results): "unstructured.ingest.interfaces.partition", return_value=partition_test_results, ) + partition_config = PartitionConfig( + metadata_include=["filename", "data_source"], + flatten_metadata=True, + ) test_ingest_doc = TestIngestDoc( connector_config=TEST_CONFIG, read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - partition_config=PartitionConfig( + processor_config=ProcessorConfig( output_dir=TEST_OUTPUT_DIR, - metadata_include=["filename", "data_source"], - flatten_metadata=True, ), ) - isd_elems = test_ingest_doc.process_file() + isd_elems = test_ingest_doc.process_file(partition_config=partition_config) expected_keys = {"element_id", "text", "type", "filename", "data_source"} for elem in isd_elems: assert expected_keys == set(elem.keys()) diff --git a/test_unstructured_ingest/unit/test_paths.py b/test_unstructured_ingest/unit/test_paths.py index 0399226246..e3c6e36d2a 100644 --- a/test_unstructured_ingest/unit/test_paths.py +++ 
b/test_unstructured_ingest/unit/test_paths.py @@ -31,7 +31,7 @@ def test_dropbox_root_succeeds(): dbox = DropboxIngestDoc( connector_config=FakeConfigDropboxRoot, read_config=FakeConfigDropboxRoot, - partition_config=FakeConfigDropboxRoot, + processor_config=FakeConfigDropboxRoot, remote_file_path="/fake_file.txt", ) output_filename = dbox._output_filename @@ -49,7 +49,7 @@ def test_dropbox_root_succeeds2(): dbox = DropboxIngestDoc( connector_config=FakeConfigDropboxRoot, read_config=FakeConfigDropboxRoot, - partition_config=FakeConfigDropboxRoot, + processor_config=FakeConfigDropboxRoot, remote_file_path="fake_file.txt", ) output_filename = dbox._output_filename @@ -67,7 +67,7 @@ def test_dropbox_folder_succeeds(): dbox = DropboxIngestDoc( connector_config=FakeConfigFolder, read_config=FakeConfigFolder, - partition_config=FakeConfigFolder, + processor_config=FakeConfigFolder, remote_file_path="fake_file2.txt", ) output_filename = dbox._output_filename @@ -83,7 +83,7 @@ def test_dropbox_folder_fails(): dbox = DropboxIngestDoc( connector_config=FakeConfigFolder, read_config=FakeConfigFolder, - partition_config=FakeConfigFolder, + processor_config=FakeConfigFolder, remote_file_path="/fake_file2.txt", ) output_filename = dbox._output_filename @@ -101,7 +101,7 @@ def test_fsspec_folder_succeeds(): dbox = FsspecIngestDoc( connector_config=FakeConfigFolder, read_config=FakeConfigFolder, - partition_config=FakeConfigFolder, + processor_config=FakeConfigFolder, remote_file_path="fake_file2.txt", ) output_filename = dbox._output_filename @@ -117,7 +117,7 @@ def test_fsspec_folder_fails(): fstest = FsspecIngestDoc( connector_config=FakeConfigFolder, read_config=FakeConfigFolder, - partition_config=FakeConfigFolder, + processor_config=FakeConfigFolder, remote_file_path="/fake_file2.txt", ) output_filename = fstest._output_filename diff --git a/unstructured/ingest/cli/cmds/airtable.py b/unstructured/ingest/cli/cmds/airtable.py index a0f704a931..36d66c9948 100644 --- 
a/unstructured/ingest/cli/cmds/airtable.py +++ b/unstructured/ingest/cli/cmds/airtable.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import airtable as airtable_fn +from unstructured.ingest.runner import AirtableRunner @dataclass @@ -82,12 +80,11 @@ def airtable_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - AirtableCliConfig.from_dict(options) - airtable_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[AirtableCliConfig]) + runner = AirtableRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -95,10 +92,5 @@ def airtable_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = airtable_source - AirtableCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[AirtableCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/azure.py b/unstructured/ingest/cli/cmds/azure.py index 6db5b57849..5a54bc279e 100644 --- 
a/unstructured/ingest/cli/cmds/azure.py +++ b/unstructured/ingest/cli/cmds/azure.py @@ -4,20 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import azure as azure_fn +from unstructured.ingest.runner import AzureRunner @dataclass @@ -60,12 +58,11 @@ def azure_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - AzureCliConfig.from_dict(options) - azure_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[AzureCliConfig]) + runner = AzureRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -73,12 +70,5 @@ def azure_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = azure_source - AzureCliConfig.add_cli_options(cmd) - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[AzureCliConfig, CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git 
a/unstructured/ingest/cli/cmds/azure_cognitive_search.py b/unstructured/ingest/cli/cmds/azure_cognitive_search.py index 241a66b2ba..ebbcfb2630 100644 --- a/unstructured/ingest/cli/cmds/azure_cognitive_search.py +++ b/unstructured/ingest/cli/cmds/azure_cognitive_search.py @@ -1,20 +1,15 @@ import logging -import types from dataclasses import dataclass import click -from unstructured.ingest.cli.cmds.utils import conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( - CliChunkingConfig, - CliEmbeddingsConfig, CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger from unstructured.ingest.runner import runner_map @@ -72,35 +67,16 @@ def azure_cognitive_search_dest(ctx: click.Context, **options): log_options(parent_options, verbose=verbose) log_options(options, verbose=verbose) try: - read_config = CliReadConfig.from_dict(parent_options) - partition_config = CliPartitionConfig.from_dict(parent_options) - embedding_config = CliEmbeddingsConfig.from_dict(parent_options) - chunking_config = CliChunkingConfig.from_dict(parent_options) - # Run for schema validation - AzureCognitiveSearchCliWriteConfig.from_dict(options) - runner = runner_map[source_cmd] - # TODO update all other runners to implement base runner class - if isinstance(runner, types.FunctionType): - runner( - read_config=read_config, - partition_config=partition_config, - writer_type="s3", - writer_kwargs=options, - **parent_options, - ) - else: - runner_instance = runner( - read_config=read_config, - partition_config=partition_config, - writer_type="azure_cognitive_search", - writer_kwargs=options, - embedding_config=embedding_config, - chunking_config=chunking_config, - ) - runner_instance.run( - **parent_options, - ) - + configs = 
extract_configs(options, validate=[AzureCognitiveSearchCliWriteConfig]) + runner_cls = runner_map[source_cmd] + runner = runner_cls( + **configs, + writer_type="azure_cognitive_search", + writer_kwargs=options, + ) + runner.run( + **parent_options, + ) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e diff --git a/unstructured/ingest/cli/cmds/biomed.py b/unstructured/ingest/cli/cmds/biomed.py index 77d836070d..2d39ed4da6 100644 --- a/unstructured/ingest/cli/cmds/biomed.py +++ b/unstructured/ingest/cli/cmds/biomed.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import biomed as biomed_fn +from unstructured.ingest.runner import BiomedRunner @dataclass @@ -81,12 +79,11 @@ def biomed_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - BiomedCliConfig.from_dict(options) - biomed_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[BiomedCliConfig]) + runner = BiomedRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -94,10 +91,5 @@ def biomed_source(ctx: click.Context, **options): def get_source_cmd() -> 
click.Group: cmd = biomed_source - BiomedCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[BiomedCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/box.py b/unstructured/ingest/cli/cmds/box.py index 515f65799a..49b90f30aa 100644 --- a/unstructured/ingest/cli/cmds/box.py +++ b/unstructured/ingest/cli/cmds/box.py @@ -4,20 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import box as box_fn +from unstructured.ingest.runner import BoxRunner @dataclass @@ -47,12 +45,11 @@ def box_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - BoxCliConfig.from_dict(options) - box_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[BoxCliConfig]) + runner = BoxRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -60,12 +57,5 @@ def box_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = box_source - 
BoxCliConfig.add_cli_options(cmd) - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[BoxCliConfig, CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/confluence.py b/unstructured/ingest/cli/cmds/confluence.py index eaa9b37646..0378c9fdb1 100644 --- a/unstructured/ingest/cli/cmds/confluence.py +++ b/unstructured/ingest/cli/cmds/confluence.py @@ -4,22 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import confluence as confluence_fn +from unstructured.ingest.runner import ConfluenceRunner @dataclass @@ -90,12 +85,11 @@ def confluence_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - ConfluenceCliConfig.from_dict(options) - confluence_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[ConfluenceCliConfig]) + runner = ConfluenceRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, 
exc_info=True) raise click.ClickException(str(e)) from e @@ -103,10 +97,5 @@ def confluence_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = confluence_source - ConfluenceCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[ConfluenceCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/delta_table.py b/unstructured/ingest/cli/cmds/delta_table.py index b3af8a84dd..1335cbe213 100644 --- a/unstructured/ingest/cli/cmds/delta_table.py +++ b/unstructured/ingest/cli/cmds/delta_table.py @@ -4,19 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import delta_table as delta_table_fn -from unstructured.ingest.runner import runner_map +from unstructured.ingest.runner import DeltaTableRunner, runner_map @dataclass @@ -68,12 +65,11 @@ def delta_table_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - DeltaTableCliConfig.from_dict(options) - delta_table_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[DeltaTableCliConfig]) + runner = 
DeltaTableRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -114,7 +110,6 @@ def delta_table_dest(ctx: click.Context, **options): if not ctx.parent.info_name: raise click.ClickException("parent command missing info name") source_cmd = ctx.parent.info_name.replace("-", "_") - runner_fn = runner_map[source_cmd] parent_options: dict = ctx.parent.params if ctx.parent else {} conform_click_options(options) conform_click_options(parent_options) @@ -123,17 +118,16 @@ def delta_table_dest(ctx: click.Context, **options): log_options(parent_options, verbose=verbose) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(parent_options) - partition_config = CliPartitionConfig.from_dict(parent_options) - # Run for schema validation - DeltaTableCliConfig.from_dict(options) + configs = extract_configs(parent_options, validate=[DeltaTableCliConfig]) + # Validate write configs DeltaTableCliWriteConfig.from_dict(options) - runner_fn( - read_config=read_config, - partition_config=partition_config, + runner_cls = runner_map[source_cmd] + runner = runner_cls( + **configs, writer_type="delta_table", writer_kwargs=options, + ) + runner.run( **parent_options, ) except Exception as e: @@ -150,10 +144,5 @@ def get_dest_cmd() -> click.Command: def get_source_cmd() -> click.Group: cmd = delta_table_source - DeltaTableCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[DeltaTableCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/discord.py b/unstructured/ingest/cli/cmds/discord.py index 421d4e8c5a..373557971f 100644 --- a/unstructured/ingest/cli/cmds/discord.py +++ b/unstructured/ingest/cli/cmds/discord.py @@ -4,22 
+4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import discord as discord_fn +from unstructured.ingest.runner import DiscordRunner @dataclass @@ -64,12 +59,11 @@ def discord_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - DiscordCliConfig.from_dict(options) - discord_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[DiscordCliConfig]) + runner = DiscordRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -77,10 +71,5 @@ def discord_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = discord_source - DiscordCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[DiscordCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/dropbox.py b/unstructured/ingest/cli/cmds/dropbox.py index 3c1ddf73fc..f06c263de4 100644 --- a/unstructured/ingest/cli/cmds/dropbox.py +++ 
b/unstructured/ingest/cli/cmds/dropbox.py @@ -3,20 +3,18 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import dropbox as dropbox_fn +from unstructured.ingest.runner import DropboxRunner @dataclass @@ -46,12 +44,11 @@ def dropbox_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - DropboxCliConfig.from_dict(options) - dropbox_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[DropboxCliConfig]) + runner = DropboxRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -59,12 +56,5 @@ def dropbox_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = dropbox_source - DropboxCliConfig.add_cli_options(cmd) - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[DropboxCliConfig, CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git 
a/unstructured/ingest/cli/cmds/elasticsearch.py b/unstructured/ingest/cli/cmds/elasticsearch.py index bb71579a78..97f9162b2b 100644 --- a/unstructured/ingest/cli/cmds/elasticsearch.py +++ b/unstructured/ingest/cli/cmds/elasticsearch.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import elasticsearch as elasticsearch_fn +from unstructured.ingest.runner import ElasticSearchRunner @dataclass @@ -63,12 +61,11 @@ def elasticsearch_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - ElasticsearchCliConfig.from_dict(options) - elasticsearch_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[ElasticsearchCliConfig]) + runner = ElasticSearchRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -76,10 +73,5 @@ def elasticsearch_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = elasticsearch_source - ElasticsearchCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, 
default=False)) + add_options(cmd, extras=[ElasticsearchCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/fsspec.py b/unstructured/ingest/cli/cmds/fsspec.py index 889e582851..f4ad0b1313 100644 --- a/unstructured/ingest/cli/cmds/fsspec.py +++ b/unstructured/ingest/cli/cmds/fsspec.py @@ -2,18 +2,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import fsspec as fsspec_fn +from unstructured.ingest.runner import FsspecRunner @click.group(name="fsspec", invoke_without_command=True, cls=Group) @@ -27,10 +25,11 @@ def fsspec_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - fsspec_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options) + runner = FsspecRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -38,11 +37,5 @@ def fsspec_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = fsspec_source - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, 
extras=[CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/gcs.py b/unstructured/ingest/cli/cmds/gcs.py index f5d8e1bd12..0b549004e3 100644 --- a/unstructured/ingest/cli/cmds/gcs.py +++ b/unstructured/ingest/cli/cmds/gcs.py @@ -4,20 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gcs as gcs_fn +from unstructured.ingest.runner import GCSRunner @dataclass @@ -49,12 +47,11 @@ def gcs_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - GcsCliConfig.from_dict(options) - gcs_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([GcsCliConfig])) + runner = GCSRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -62,12 +59,5 @@ def gcs_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = gcs_source - GcsCliConfig.add_cli_options(cmd) - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - 
cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[GcsCliConfig, CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/github.py b/unstructured/ingest/cli/cmds/github.py index 86ea4d7653..ede3539403 100644 --- a/unstructured/ingest/cli/cmds/github.py +++ b/unstructured/ingest/cli/cmds/github.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import github as github_fn +from unstructured.ingest.runner import GithubRunner @dataclass @@ -72,12 +70,11 @@ def github_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - GithubCliConfig.from_dict(options) - github_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([GithubCliConfig])) + runner = GithubRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -85,10 +82,5 @@ def github_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = github_source - GithubCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) 
- cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[GithubCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/gitlab.py b/unstructured/ingest/cli/cmds/gitlab.py index 92ed7928d9..b971d83bdd 100644 --- a/unstructured/ingest/cli/cmds/gitlab.py +++ b/unstructured/ingest/cli/cmds/gitlab.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gitlab as gitlab_fn +from unstructured.ingest.runner import GitlabRunner @dataclass @@ -72,12 +70,11 @@ def gitlab_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - GitlabCliConfig.from_dict(options) - gitlab_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([GitlabCliConfig])) + runner = GitlabRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -85,10 +82,5 @@ def gitlab_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = gitlab_source - GitlabCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - 
cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[GitlabCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/google_drive.py b/unstructured/ingest/cli/cmds/google_drive.py index ada078288f..2a8551355a 100644 --- a/unstructured/ingest/cli/cmds/google_drive.py +++ b/unstructured/ingest/cli/cmds/google_drive.py @@ -4,19 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gdrive as gdrive_fn +from unstructured.ingest.runner import GoogleDriveRunner @dataclass @@ -61,12 +59,11 @@ def google_drive_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - GoogleDriveCliConfig.from_dict(options) - gdrive_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([GoogleDriveCliConfig])) + runner = GoogleDriveRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -74,11 +71,5 @@ def google_drive_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = google_drive_source - GoogleDriveCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # 
Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[GoogleDriveCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/jira.py b/unstructured/ingest/cli/cmds/jira.py index 76797f1e56..c724a03dc8 100644 --- a/unstructured/ingest/cli/cmds/jira.py +++ b/unstructured/ingest/cli/cmds/jira.py @@ -4,22 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import jira as jira_fn +from unstructured.ingest.runner import JiraRunner @dataclass @@ -92,12 +87,11 @@ def jira_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - JiraCliConfig.from_dict(options) - jira_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([JiraCliConfig])) + runner = JiraRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -105,10 +99,5 @@ def jira_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = jira_source - 
JiraCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[JiraCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/local.py b/unstructured/ingest/cli/cmds/local.py index a3416226b0..1f91d32c85 100644 --- a/unstructured/ingest/cli/cmds/local.py +++ b/unstructured/ingest/cli/cmds/local.py @@ -4,19 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import local as local_fn +from unstructured.ingest.runner import LocalRunner @dataclass @@ -55,12 +53,11 @@ def local_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - LocalCliConfig.from_dict(options) - local_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([LocalCliConfig])) + runner = LocalRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -68,11 +65,5 @@ def local_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = local_source - 
LocalCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[LocalCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/notion.py b/unstructured/ingest/cli/cmds/notion.py index d5ac645141..a8617c3582 100644 --- a/unstructured/ingest/cli/cmds/notion.py +++ b/unstructured/ingest/cli/cmds/notion.py @@ -4,23 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import notion as notion_fn +from unstructured.ingest.runner import NotionRunner @dataclass @@ -65,12 +60,11 @@ def notion_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - NotionCliConfig.from_dict(options) - notion_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([NotionCliConfig])) + runner = NotionRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -78,11 +72,5 @@ def 
notion_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = notion_source - NotionCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[NotionCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/onedrive.py b/unstructured/ingest/cli/cmds/onedrive.py index fcfe67b871..b2664258b4 100644 --- a/unstructured/ingest/cli/cmds/onedrive.py +++ b/unstructured/ingest/cli/cmds/onedrive.py @@ -4,19 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import onedrive as onedrive_fn +from unstructured.ingest.runner import OneDriveRunner @dataclass @@ -83,12 +81,11 @@ def onedrive_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - OnedriveCliConfig.from_dict(options) - onedrive_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([OnedriveCliConfig])) + runner = OneDriveRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, 
exc_info=True) raise click.ClickException(str(e)) from e @@ -96,11 +93,5 @@ def onedrive_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = onedrive_source - OnedriveCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[OnedriveCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/outlook.py b/unstructured/ingest/cli/cmds/outlook.py index b3397444f0..67509d41b0 100644 --- a/unstructured/ingest/cli/cmds/outlook.py +++ b/unstructured/ingest/cli/cmds/outlook.py @@ -4,23 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import outlook as outlook_fn +from unstructured.ingest.runner import OutlookRunner @dataclass @@ -87,12 +82,11 @@ def outlook_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - OutlookCliConfig.from_dict(options) - outlook_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([OutlookCliConfig])) + 
runner = OutlookRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -100,11 +94,5 @@ def outlook_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = outlook_source - OutlookCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[OutlookCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/reddit.py b/unstructured/ingest/cli/cmds/reddit.py index d4caea5f87..67bbc7c237 100644 --- a/unstructured/ingest/cli/cmds/reddit.py +++ b/unstructured/ingest/cli/cmds/reddit.py @@ -4,18 +4,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import reddit as reddit_fn +from unstructured.ingest.runner import RedditRunner @dataclass @@ -24,6 +22,7 @@ class RedditCliConfig(BaseConfig, CliMixin): client_secret: str subreddit_name: str user_agent: str + num_posts: int search_query: t.Optional[str] = None @staticmethod @@ -57,6 +56,18 @@ def add_cli_options(cmd: click.Command) -> None: type=str, help="If set, return posts using this query. 
Otherwise, use hot posts.", ), + click.Option( + ["--num-posts"], + required=True, + type=click.IntRange(0), + help="If set, limits the number of posts to pull in.", + ), + click.Option( + ["--user-agent"], + required=True, + type=str, + help="user agent request header to use when calling Reddit API", + ), ] cmd.params.extend(options) @@ -72,12 +83,11 @@ def reddit_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - RedditCliConfig.from_dict(options) - reddit_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([RedditCliConfig])) + runner = RedditRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -85,10 +95,5 @@ def reddit_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = reddit_source - RedditCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[RedditCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/s3.py b/unstructured/ingest/cli/cmds/s3.py index 34a7845f1b..572578e396 100644 --- a/unstructured/ingest/cli/cmds/s3.py +++ b/unstructured/ingest/cli/cmds/s3.py @@ -4,20 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import Group from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, CliRemoteUrlConfig, ) +from unstructured.ingest.cli.utils import Group, 
add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import s3 as s3_fn +from unstructured.ingest.runner import S3Runner, runner_map @dataclass @@ -59,12 +57,11 @@ def s3_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - S3CliConfig.from_dict(options) - s3_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=[S3CliConfig]) + s3_runner = S3Runner( + **configs, # type: ignore + ) + s3_runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -73,29 +70,26 @@ def s3_source(ctx: click.Context, **options): @click.command(name="s3") @click.pass_context def s3_dest(ctx: click.Context, **options): + if not ctx.parent: + raise click.ClickException("destination command called without a parent") + if not ctx.parent.info_name: + raise click.ClickException("parent command missing info name") + source_cmd = ctx.parent.info_name.replace("-", "_") parent_options: dict = ctx.parent.params if ctx.parent else {} - # Click sets all multiple fields as tuple, this needs to be updated to list - for k, v in options.items(): - if isinstance(v, tuple): - options[k] = list(v) - for k, v in parent_options.items(): - if isinstance(v, tuple): - parent_options[k] = list(v) + conform_click_options(options) verbose = parent_options.get("verbose", False) ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(parent_options, verbose=verbose) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - 
read_config = CliReadConfig.from_dict(parent_options) - partition_config = CliPartitionConfig.from_dict(parent_options) - # Run for schema validation - S3CliConfig.from_dict(options) - s3_fn( - read_config=read_config, - partition_config=partition_config, + configs = extract_configs(options, validate=[S3CliConfig]) + runner_cls = runner_map[source_cmd] + runner = runner_cls( + **configs, writer_type="s3", writer_kwargs=options, + ) + runner.run( **parent_options, ) except Exception as e: @@ -112,12 +106,5 @@ def get_dest_cmd() -> click.Command: def get_source_cmd() -> click.Group: cmd = s3_source - S3CliConfig.add_cli_options(cmd) - CliRemoteUrlConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[S3CliConfig, CliRemoteUrlConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/salesforce.py b/unstructured/ingest/cli/cmds/salesforce.py index 051ad69aa3..428c90a955 100644 --- a/unstructured/ingest/cli/cmds/salesforce.py +++ b/unstructured/ingest/cli/cmds/salesforce.py @@ -4,23 +4,18 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import salesforce as salesforce_fn +from unstructured.ingest.runner import SalesforceRunner @dataclass @@ -76,12 +71,11 @@ def salesforce_source(ctx: 
click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - SalesforceCliConfig.from_dict(options) - salesforce_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(options, validate=([SalesforceCliConfig])) + runner = SalesforceRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -89,11 +83,5 @@ def salesforce_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = salesforce_source - SalesforceCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[SalesforceCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/sharepoint.py b/unstructured/ingest/cli/cmds/sharepoint.py index 5027fe3a80..9d7603c907 100644 --- a/unstructured/ingest/cli/cmds/sharepoint.py +++ b/unstructured/ingest/cli/cmds/sharepoint.py @@ -4,21 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( log_options, ) from unstructured.ingest.cli.interfaces import ( - CliChunkingConfig, - CliEmbeddingsConfig, CliMixin, - CliPartitionConfig, - CliReadConfig, CliRecursiveConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from 
unstructured.ingest.runner import SharePoint +from unstructured.ingest.runner import SharePointRunner @dataclass @@ -84,18 +80,9 @@ def sharepoint_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - embedding_config = CliEmbeddingsConfig.from_dict(options) - chunking_config = CliChunkingConfig.from_dict(options) - # Run for schema validation - SharepointCliConfig.from_dict(options) - sharepoint_runner = SharePoint( - read_config=read_config, - partition_config=partition_config, - verbose=verbose, - embedding_config=embedding_config, - chunking_config=chunking_config, + configs = extract_configs(data=options, validate=[SharepointCliConfig]) + sharepoint_runner = SharePointRunner( + **configs, # type: ignore ) sharepoint_runner.run(**options) except Exception as e: @@ -105,13 +92,5 @@ def sharepoint_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = sharepoint_source - SharepointCliConfig.add_cli_options(cmd) - CliRecursiveConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - CliEmbeddingsConfig.add_cli_options(cmd) - CliChunkingConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[SharepointCliConfig, CliRecursiveConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/slack.py b/unstructured/ingest/cli/cmds/slack.py index 93c057dec4..a00f1dd65a 100644 --- a/unstructured/ingest/cli/cmds/slack.py +++ b/unstructured/ingest/cli/cmds/slack.py @@ -4,22 +4,17 @@ import click -from unstructured.ingest.cli.cmds.utils import ( - DelimitedString, - Group, - conform_click_options, -) from unstructured.ingest.cli.common import ( log_options, ) from 
unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, + DelimitedString, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import slack as slack_fn +from unstructured.ingest.runner import SlackRunner @dataclass @@ -75,12 +70,11 @@ def slack_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - SlackCliConfig.from_dict(options) - slack_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(data=options, validate=[SlackCliConfig]) + sharepoint_runner = SlackRunner( + **configs, # type: ignore + ) + sharepoint_runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -88,10 +82,5 @@ def slack_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = slack_source - SlackCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[SlackCliConfig]) return cmd diff --git a/unstructured/ingest/cli/cmds/wikipedia.py b/unstructured/ingest/cli/cmds/wikipedia.py index dd05f86243..b132c06843 100644 --- a/unstructured/ingest/cli/cmds/wikipedia.py +++ b/unstructured/ingest/cli/cmds/wikipedia.py @@ -3,18 +3,16 @@ import click -from unstructured.ingest.cli.cmds.utils import Group, conform_click_options from unstructured.ingest.cli.common import ( 
log_options, ) from unstructured.ingest.cli.interfaces import ( CliMixin, - CliPartitionConfig, - CliReadConfig, ) +from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs from unstructured.ingest.interfaces import BaseConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import wikipedia as wikipedia_fn +from unstructured.ingest.runner import WikipediaRunner @dataclass @@ -53,12 +51,11 @@ def wikipedia_source(ctx: click.Context, **options): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) log_options(options, verbose=verbose) try: - # run_init_checks(**options) - read_config = CliReadConfig.from_dict(options) - partition_config = CliPartitionConfig.from_dict(options) - # Run for schema validation - WikipediaCliConfig.from_dict(options) - wikipedia_fn(read_config=read_config, partition_config=partition_config, **options) + configs = extract_configs(data=options, validate=[WikipediaCliConfig]) + runner = WikipediaRunner( + **configs, # type: ignore + ) + runner.run(**options) except Exception as e: logger.error(e, exc_info=True) raise click.ClickException(str(e)) from e @@ -66,10 +63,5 @@ def wikipedia_source(ctx: click.Context, **options): def get_source_cmd() -> click.Group: cmd = wikipedia_source - WikipediaCliConfig.add_cli_options(cmd) - - # Common CLI configs - CliReadConfig.add_cli_options(cmd) - CliPartitionConfig.add_cli_options(cmd) - cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) + add_options(cmd, extras=[WikipediaCliConfig]) return cmd diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 7ec4660a6f..2e5ab72c8a 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -1,18 +1,56 @@ +import typing as t from abc import abstractmethod +from gettext import ngettext +from pathlib import Path import click from 
dataclasses_json.core import Json, _decode_dataclass -from unstructured.ingest.cli.cmds.utils import DelimitedString from unstructured.ingest.interfaces import ( BaseConfig, ChunkingConfig, EmbeddingConfig, PartitionConfig, + ProcessorConfig, ReadConfig, ) +class DelimitedString(click.ParamType): + name = "delimited-string" + + def __init__(self, delimiter: str = ",", choices: t.Optional[t.List[str]] = None): + self.choices = choices if choices else [] + self.delimiter = delimiter + + def convert( + self, + value: t.Any, + param: t.Optional[click.Parameter], + ctx: t.Optional[click.Context], + ) -> t.Any: + # In case a list is provided as the default, will not break + if isinstance(value, list): + split = [str(v).strip() for v in value] + else: + split = [v.strip() for v in value.split(self.delimiter)] + if not self.choices: + return split + choices_str = ", ".join(map(repr, self.choices)) + for s in split: + if s not in self.choices: + self.fail( + ngettext( + "{value!r} is not {choice}.", + "{value!r} is not one of {choices}.", + len(self.choices), + ).format(value=s, choice=choices_str, choices=choices_str), + param, + ctx, + ) + return split + + class CliMixin: @staticmethod @abstractmethod @@ -20,6 +58,42 @@ def add_cli_options(cmd: click.Command) -> None: pass +class CliProcessorConfig(ProcessorConfig, CliMixin): + @staticmethod + def add_cli_options(cmd: click.Command) -> None: + options = [ + click.Option( + ["--reprocess"], + is_flag=True, + default=False, + help="Reprocess a downloaded file even if the relevant structured " + "output .json file in output directory already exists.", + ), + click.Option( + ["--output-dir"], + default="structured-output", + help="Where to place structured output .json files.", + ), + click.Option( + ["--work-dir"], + type=str, + default=str( + (Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve(), + ), + show_default=True, + help="Where to place working files when processing each step", + ), + 
click.Option( + ["--num-processes"], + default=2, + show_default=True, + help="Number of parallel processes with which to process docs", + ), + click.Option(["-v", "--verbose"], is_flag=True, default=False), + ] + cmd.params.extend(options) + + class CliReadConfig(ReadConfig, CliMixin): @staticmethod def add_cli_options(cmd: click.Command) -> None: @@ -51,6 +125,12 @@ def add_cli_options(cmd: click.Command) -> None: "is not specified and " "skip processing them through unstructured.", ), + click.Option( + ["--max-docs"], + default=None, + type=int, + help="If specified, process at most the specified number of documents.", + ), ] cmd.params.extend(options) @@ -59,23 +139,6 @@ class CliPartitionConfig(PartitionConfig, CliMixin): @staticmethod def add_cli_options(cmd: click.Command) -> None: options = [ - click.Option( - ["--output-dir"], - default="structured-output", - help="Where to place structured output .json files.", - ), - click.Option( - ["--num-processes"], - default=2, - show_default=True, - help="Number of parallel processes to process docs in.", - ), - click.Option( - ["--max-docs"], - default=None, - type=int, - help="If specified, process at most specified number of documents.", - ), click.Option( ["--pdf-infer-table-structure"], default=False, @@ -88,13 +151,6 @@ def add_cli_options(cmd: click.Command) -> None: help="The method that will be used to process the documents. " "Default: auto. 
Other strategies include `fast` and `hi_res`.", ), - click.Option( - ["--reprocess"], - is_flag=True, - default=False, - help="Reprocess a downloaded file even if the relevant structured " - "output .json file in output directory already exists.", - ), click.Option( ["--ocr-languages"], default="eng", @@ -193,7 +249,7 @@ def add_cli_options(cmd: click.Command) -> None: cmd.params.extend(options) -class CliEmbeddingsConfig(EmbeddingConfig, CliMixin): +class CliEmbeddingConfig(EmbeddingConfig, CliMixin): @staticmethod def add_cli_options(cmd: click.Command) -> None: options = [ @@ -229,6 +285,8 @@ def from_dict( } if len(new_kvs.keys()) == 0: return None + if not new_kvs.get("api_key", None): + return None return _decode_dataclass(cls, new_kvs, infer_missing) return _decode_dataclass(cls, kvs, infer_missing) @@ -277,7 +335,10 @@ def from_dict( if isinstance(kvs, dict): new_kvs = {} if "chunk_elements" in kvs: - new_kvs["chunk_elements"] = kvs.pop("chunk_elements") + chunk_elements = kvs.pop("chunk_elements") + if not chunk_elements: + return None + new_kvs["chunk_elements"] = chunk_elements new_kvs.update( { k[len("chunking_") :]: v # noqa: E203 diff --git a/unstructured/ingest/cli/cmds/utils.py b/unstructured/ingest/cli/utils.py similarity index 59% rename from unstructured/ingest/cli/cmds/utils.py rename to unstructured/ingest/cli/utils.py index dcf85fedd8..e04ce48d14 100644 --- a/unstructured/ingest/cli/cmds/utils.py +++ b/unstructured/ingest/cli/utils.py @@ -1,9 +1,20 @@ import typing as t from gettext import gettext as _ -from gettext import ngettext import click +from unstructured.ingest.cli.interfaces import ( + CliChunkingConfig, + CliEmbeddingConfig, + CliMixin, + CliPartitionConfig, + CliProcessorConfig, + CliReadConfig, +) +from unstructured.ingest.interfaces import ( + BaseConfig, +) + def conform_click_options(options: dict): # Click sets all multiple fields as tuple, this needs to be updated to list @@ -12,39 +23,40 @@ def conform_click_options(options: 
dict): options[k] = list(v) -class DelimitedString(click.ParamType): - name = "delimited-string" - - def __init__(self, delimiter: str = ",", choices: t.Optional[t.List[str]] = None): - self.choices = choices if choices else [] - self.delimiter = delimiter - - def convert( - self, - value: t.Any, - param: t.Optional[click.Parameter], - ctx: t.Optional[click.Context], - ) -> t.Any: - # In case a list is provided as the default, will not break - if isinstance(value, list): - split = [str(v).strip() for v in value] - else: - split = [v.strip() for v in value.split(self.delimiter)] - if not self.choices: - return split - choices_str = ", ".join(map(repr, self.choices)) - for s in split: - if s not in self.choices: - self.fail( - ngettext( - "{value!r} is not {choice}.", - "{value!r} is not one of {choices}.", - len(self.choices), - ).format(value=s, choice=choices_str, choices=choices_str), - param, - ctx, - ) - return split +def extract_configs( + data: dict, + validate: t.Optional[t.List[t.Type[BaseConfig]]] = None, +) -> t.Dict[str, BaseConfig]: + """ + Extract all common configs used across CLI command and validate that any + command-specific configs have all their needed information from the Click + options that are passed in during invocation. 
+ """ + validate = validate if validate else [] + res = { + "read_config": CliReadConfig.from_dict(data), + "partition_config": CliPartitionConfig.from_dict(data), + "embedding_config": CliEmbeddingConfig.from_dict(data), + "chunking_config": CliChunkingConfig.from_dict(data), + "processor_config": CliProcessorConfig.from_dict(data), + } + for v in validate: + v.from_dict(data) + return res + + +def add_options(cmd: click.Command, extras=t.List[t.Type[CliMixin]]) -> click.Command: + configs: t.List[t.Type[CliMixin]] = [ + CliPartitionConfig, + CliReadConfig, + CliEmbeddingConfig, + CliChunkingConfig, + CliProcessorConfig, + ] + configs.extend(extras) + for config in configs: + config.add_cli_options(cmd) + return cmd class Group(click.Group): diff --git a/unstructured/ingest/connector/airtable.py b/unstructured/ingest/connector/airtable.py index bdae5d2f9d..c3dc0b9eb3 100644 --- a/unstructured/ingest/connector/airtable.py +++ b/unstructured/ingest/connector/airtable.py @@ -65,7 +65,7 @@ def filename(self): def _output_filename(self): """Create output file path based on output directory, base id, and table id""" output_file = f"{self.table_meta.table_id}.json" - return Path(self.partition_config.output_dir) / self.table_meta.base_id / output_file + return Path(self.processor_config.output_dir) / self.table_meta.base_id / output_file @property def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: @@ -274,8 +274,8 @@ def get_ingest_docs(self): baseid_tableid_viewid_tuples += self.fetch_table_ids() return [ AirtableIngestDoc( + processor_config=self.processor_config, connector_config=self.connector_config, - partition_config=self.partition_config, read_config=self.read_config, table_meta=AirtableTableMeta(base_id, table_id, view_id), ) diff --git a/unstructured/ingest/connector/azure.py b/unstructured/ingest/connector/azure.py index 4a503a7835..004ca782b4 100644 --- a/unstructured/ingest/connector/azure.py +++ b/unstructured/ingest/connector/azure.py @@ -30,7 
+30,9 @@ def get_file(self): @dataclass class AzureBlobStorageSourceConnector(FsspecSourceConnector): connector_config: SimpleAzureBlobStorageConfig - ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc + + def __post_init__(self): + self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc @dataclass diff --git a/unstructured/ingest/connector/biomed.py b/unstructured/ingest/connector/biomed.py index 149b51014c..d817c4893d 100644 --- a/unstructured/ingest/connector/biomed.py +++ b/unstructured/ingest/connector/biomed.py @@ -164,7 +164,7 @@ def urls_to_metadata(urls): download_filepath=(Path(self.read_config.download_dir) / local_path) .resolve() .as_posix(), - output_filepath=(Path(self.partition_config.output_dir) / local_path) + output_filepath=(Path(self.processor_config.output_dir) / local_path) .resolve() .as_posix(), ), @@ -246,7 +246,7 @@ def traverse(path, download_dir, output_dir): .resolve() .as_posix(), output_filepath=( - Path(self.partition_config.output_dir) / local_path + Path(self.processor_config.output_dir) / local_path ) .resolve() .as_posix(), @@ -269,7 +269,7 @@ def traverse(path, download_dir, output_dir): download_filepath=(Path(self.read_config.download_dir) / local_path) .resolve() .as_posix(), - output_filepath=(Path(self.partition_config.output_dir) / local_path) + output_filepath=(Path(self.processor_config.output_dir) / local_path) .resolve() .as_posix(), ), @@ -278,7 +278,7 @@ def traverse(path, download_dir, output_dir): traverse( Path(path), Path(self.read_config.download_dir), - Path(self.partition_config.output_dir), + Path(self.processor_config.output_dir), ) return files @@ -290,9 +290,9 @@ def get_ingest_docs(self): files = self._list_objects_api() if self.connector_config.is_api else self._list_objects() return [ BiomedIngestDoc( + processor_config=self.processor_config, connector_config=self.connector_config, read_config=self.read_config, - 
partition_config=self.partition_config, file_meta=file, ) for file in files diff --git a/unstructured/ingest/connector/box.py b/unstructured/ingest/connector/box.py index 899f0b019b..5c63ecd30d 100644 --- a/unstructured/ingest/connector/box.py +++ b/unstructured/ingest/connector/box.py @@ -56,7 +56,9 @@ def get_file(self): @dataclass class BoxSourceConnector(FsspecSourceConnector): connector_config: SimpleBoxConfig - ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc + + def __post_init__(self): + self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc @dataclass diff --git a/unstructured/ingest/connector/confluence.py b/unstructured/ingest/connector/confluence.py index 17466fc75f..874aaa0c09 100644 --- a/unstructured/ingest/connector/confluence.py +++ b/unstructured/ingest/connector/confluence.py @@ -102,7 +102,7 @@ def filename(self): def _output_filename(self): """Create output file path based on output directory, space id and document id.""" output_file = f"{self.document_meta.document_id}.json" - return Path(self.partition_config.output_dir) / self.document_meta.space_id / output_file + return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file @property def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: @@ -253,7 +253,7 @@ def get_ingest_docs(self): return [ ConfluenceIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, document_meta=ConfluenceDocumentMeta(space_id, doc_id), ) diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index 976e5fbddf..2a284fd33a 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -28,7 +28,6 @@ @dataclass class SimpleDeltaTableConfig(BaseConnectorConfig): - verbose: bool table_uri: t.Union[str, Path] version: t.Optional[int] = None storage_options: 
t.Optional[t.Dict[str, str]] = None @@ -60,7 +59,7 @@ def filename(self): def _output_filename(self): """Create filename document id combined with a hash of the query to uniquely identify the output file.""" - return Path(self.partition_config.output_dir) / f"{self.uri_filename()}.json" + return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json" def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) @@ -142,7 +141,7 @@ def get_ingest_docs(self): return [ DeltaTableIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, uri=uri, modified_date=mod_date_dict[os.path.basename(uri)], diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py index d9b40d3bb6..472cda4337 100644 --- a/unstructured/ingest/connector/discord.py +++ b/unstructured/ingest/connector/discord.py @@ -29,7 +29,6 @@ class SimpleDiscordConfig(BaseConnectorConfig): channels: t.List[str] token: str days: t.Optional[int] - verbose: bool = False def __post_init__(self): if self.days: @@ -65,7 +64,7 @@ def _tmp_download_file(self): @property def _output_filename(self): output_file = self.channel + ".json" - return Path(self.partition_config.output_dir) / output_file + return Path(self.processor_config.output_dir) / output_file def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) @@ -122,7 +121,7 @@ def update_source_metadata(self, **kwargs): @BaseIngestDoc.skip_if_file_exists def get_file(self): self._create_full_tmp_dir_path() - if self.connector_config.verbose: + if self.processor_config.verbose: logger.debug(f"fetching {self} - PID: {os.getpid()}") messages, jump_url = self._get_messages() @@ -162,7 +161,7 @@ def get_ingest_docs(self): return [ DiscordIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + 
processor_config=self.processor_config, read_config=self.read_config, channel=channel, days=self.connector_config.days, diff --git a/unstructured/ingest/connector/dropbox.py b/unstructured/ingest/connector/dropbox.py index f2d41aa5aa..f27376f876 100644 --- a/unstructured/ingest/connector/dropbox.py +++ b/unstructured/ingest/connector/dropbox.py @@ -48,14 +48,14 @@ def _output_filename(self): # creates some complications in path joining so a custom path is created here. # Dropbox uses an empty string `""`, or a space `" "`` or a `" /"` to list root if self.connector_config.dir_path == " ": - return Path(self.partition_config.output_dir) / re.sub( + return Path(self.processor_config.output_dir) / re.sub( "^/", "", f"{self.remote_file_path}.json", ) else: return ( - Path(self.partition_config.output_dir) + Path(self.processor_config.output_dir) / f"{self.remote_file_path.replace(f'/{self.connector_config.dir_path}/', '')}.json" ) @@ -82,7 +82,9 @@ def _tmp_download_file(self): @dataclass class DropboxSourceConnector(FsspecSourceConnector): connector_config: SimpleDropboxConfig - ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc + + def __post_init__(self): + self.ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") def initialize(self): diff --git a/unstructured/ingest/connector/elasticsearch.py b/unstructured/ingest/connector/elasticsearch.py index 58e8aeb05b..9b021cbbc8 100644 --- a/unstructured/ingest/connector/elasticsearch.py +++ b/unstructured/ingest/connector/elasticsearch.py @@ -74,7 +74,7 @@ def _output_filename(self): query_hash = hashlib.sha256((self.connector_config.jq_query or "").encode()).hexdigest()[:8] output_file = f"{self.document_meta.document_id}-{query_hash}.json" return ( - Path(self.partition_config.output_dir) / self.connector_config.index_name / output_file + Path(self.processor_config.output_dir) / self.connector_config.index_name / output_file ) # TODO: 
change test fixtures such that examples with @@ -214,7 +214,7 @@ def get_ingest_docs(self): return [ ElasticsearchIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, document_meta=ElasticsearchDocumentMeta(self.connector_config.index_name, id), ) diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index be4dee7bf4..3b63de7dff 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -3,7 +3,7 @@ import typing as t from contextlib import suppress from dataclasses import dataclass, field -from pathlib import Path +from pathlib import Path, PurePath from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( @@ -100,7 +100,7 @@ def _tmp_download_file(self): @property def _output_filename(self): return ( - Path(self.partition_config.output_dir) + Path(self.processor_config.output_dir) / f"{self.remote_file_path.replace(f'{self.connector_config.dir_path}/', '')}.json" ) @@ -120,10 +120,10 @@ def get_file(self): ) logger.debug(f"Fetching {self} - PID: {os.getpid()}") fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix()) - self.update_source_metadata_metadata() + self.update_source_metadata() @requires_dependencies(["fsspec"]) - def update_source_metadata_metadata(self): + def update_source_metadata(self): from fsspec import AbstractFileSystem, get_filesystem_class fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( @@ -171,7 +171,9 @@ class FsspecSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): """Objects of this class support fetching document(s) from""" connector_config: SimpleFsspecConfig - ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc + + def __post_init__(self): + self.ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc def 
initialize(self): from fsspec import AbstractFileSystem, get_filesystem_class @@ -212,9 +214,9 @@ def _list_files(self): def get_ingest_docs(self): return [ self.ingest_doc_cls( + processor_config=self.processor_config, read_config=self.read_config, connector_config=self.connector_config, - partition_config=self.partition_config, remote_file_path=file, ) for file in self._list_files() @@ -242,16 +244,9 @@ def write(self, docs: t.List[BaseIngestDoc]) -> None: logger.info(f"Writing content using filesystem: {type(fs).__name__}") for doc in docs: - s3_file_path = str(doc._output_filename).replace( - doc.partition_config.output_dir, - self.connector_config.path, - ) + s3_file_path = doc.base_filename s3_folder = self.connector_config.path - if s3_folder[-1] != "/": - s3_folder = f"{s3_file_path}/" - if s3_file_path[0] == "/": - s3_file_path = s3_file_path[1:] - s3_output_path = s3_folder + s3_file_path + s3_output_path = str(PurePath(s3_folder, s3_file_path)) if s3_file_path else s3_folder logger.debug(f"Uploading {doc._output_filename} -> {s3_output_path}") fs.put_file(lpath=doc._output_filename, rpath=s3_output_path) diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py index 01a0a27ab4..1a75fef2ea 100644 --- a/unstructured/ingest/connector/gcs.py +++ b/unstructured/ingest/connector/gcs.py @@ -30,7 +30,9 @@ def get_file(self): @dataclass class GcsSourceConnector(FsspecSourceConnector): connector_config: SimpleGcsConfig - ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc + + def __post_init__(self): + self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc @dataclass diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 18926f64c2..b1fef6b5c8 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -35,7 +35,7 @@ def filename(self): @property def _output_filename(self): - return Path(self.partition_config.output_dir) / f"{self.path}.json" + return 
Path(self.processor_config.output_dir) / f"{self.path}.json" def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 8cdad07c76..d141279a16 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -87,7 +87,7 @@ def get_ingest_docs(self): return [ GitHubIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, path=element.path, ) diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 97790deaf4..8db9270c25 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -72,7 +72,7 @@ def get_ingest_docs(self): return [ GitLabIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, path=element["path"], ) diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 3144677e68..61082bc1fb 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -305,7 +305,7 @@ def traverse(drive_id, download_dir, output_dir, recursive=False): traverse( drive_id, Path(self.read_config.download_dir), - Path(self.partition_config.output_dir), + Path(self.processor_config.output_dir), recursive, ) return files @@ -318,7 +318,7 @@ def get_ingest_docs(self): return [ GoogleDriveIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, meta=file, ) diff --git a/unstructured/ingest/connector/jira.py b/unstructured/ingest/connector/jira.py index 7254584542..c8a5126124 100644 --- 
a/unstructured/ingest/connector/jira.py +++ b/unstructured/ingest/connector/jira.py @@ -293,7 +293,7 @@ def _output_filename(self): output_file = f"{self.file_meta.issue_id}.json" return ( - Path(self.partition_config.output_dir) / self.grouping_folder_name / output_file + Path(self.processor_config.output_dir) / self.grouping_folder_name / output_file ).resolve() @property @@ -436,7 +436,7 @@ def get_ingest_docs(self): return [ JiraIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, file_meta=JiraFileMeta( issue_id=issue_id, diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py index aee8a82a97..c4339b8110 100644 --- a/unstructured/ingest/connector/local.py +++ b/unstructured/ingest/connector/local.py @@ -62,7 +62,7 @@ def _output_filename(self) -> Path: if input_path.is_file() else f"{Path(self.path).relative_to(input_path)}.json" ) - return Path(self.partition_config.output_dir) / basename + return Path(self.processor_config.output_dir) / basename @dataclass @@ -105,7 +105,7 @@ def get_ingest_docs(self): return [ self.ingest_doc_cls( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, path=file, ) diff --git a/unstructured/ingest/connector/notion/connector.py b/unstructured/ingest/connector/notion/connector.py index d9b9ac84a0..ff17a57d1a 100644 --- a/unstructured/ingest/connector/notion/connector.py +++ b/unstructured/ingest/connector/notion/connector.py @@ -24,7 +24,6 @@ class SimpleNotionConfig(BaseConnectorConfig): database_ids: t.List[str] recursive: bool api_key: str - verbose: bool @dataclass @@ -48,7 +47,7 @@ def _tmp_download_file(self): @property def _output_filename(self): page_file = self.page_id + ".json" - return Path(self.partition_config.output_dir) / page_file + return 
Path(self.processor_config.output_dir) / page_file def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) @@ -160,7 +159,7 @@ def _tmp_download_file(self): @property def _output_filename(self): page_file = self.database_id + ".json" - return Path(self.partition_config.output_dir) / page_file + return Path(self.processor_config.output_dir) / page_file def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) @@ -332,7 +331,7 @@ def get_ingest_docs(self): docs += [ NotionPageIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, page_id=page_id, api_key=self.connector_config.api_key, @@ -343,7 +342,7 @@ def get_ingest_docs(self): docs += [ NotionDatabaseIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, database_id=database_id, api_key=self.connector_config.api_key, @@ -380,7 +379,7 @@ def get_ingest_docs(self): docs += [ NotionPageIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, page_id=page_id, api_key=self.connector_config.api_key, @@ -397,7 +396,7 @@ def get_ingest_docs(self): docs += [ NotionDatabaseIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, database_id=database_id, api_key=self.connector_config.api_key, diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index b58d18cc66..9212182f7b 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -82,7 +82,7 @@ def __post_init__(self): def _set_download_paths(self) -> None: 
"""Parses the folder structure from the source and creates the download and output paths""" download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.partition_config.output_dir}") + output_path = Path(f"{self.processor_config.output_dir}") if parent_path := self.file_path: download_path = ( @@ -204,7 +204,7 @@ def _gen_ingest_doc(self, file: "DriveItem") -> OneDriveIngestDoc: file_path = file_path[1:] if file_path[0] == "/" else file_path return OneDriveIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, file_name=file.name, file_path=file_path, diff --git a/unstructured/ingest/connector/outlook.py b/unstructured/ingest/connector/outlook.py index cfa12d08f5..59ab4868ca 100644 --- a/unstructured/ingest/connector/outlook.py +++ b/unstructured/ingest/connector/outlook.py @@ -87,7 +87,7 @@ def hash_mail_name(self, id): def _set_download_paths(self) -> None: """Creates paths for downloading and parsing.""" download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.partition_config.output_dir}") + output_path = Path(f"{self.processor_config.output_dir}") self.download_dir = download_path self.download_filepath = ( @@ -247,7 +247,7 @@ def get_ingest_docs(self): return [ OutlookIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, message_id=message.id, ) diff --git a/unstructured/ingest/connector/reddit.py b/unstructured/ingest/connector/reddit.py index 2de114e423..2cc5c9501f 100644 --- a/unstructured/ingest/connector/reddit.py +++ b/unstructured/ingest/connector/reddit.py @@ -99,7 +99,7 @@ def filename(self) -> Path: @property def _output_filename(self): - return Path(self.partition_config.output_dir) / f"{self.post_id}.json" + return Path(self.processor_config.output_dir) / 
f"{self.post_id}.json" @property def date_modified(self) -> t.Optional[str]: @@ -136,7 +136,7 @@ def get_ingest_docs(self): return [ RedditIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, post_id=post.id, ) diff --git a/unstructured/ingest/connector/s3.py b/unstructured/ingest/connector/s3.py index 7dabc0ac67..b3699025f0 100644 --- a/unstructured/ingest/connector/s3.py +++ b/unstructured/ingest/connector/s3.py @@ -29,7 +29,9 @@ def get_file(self): @dataclass class S3SourceConnector(FsspecSourceConnector): connector_config: SimpleS3Config - ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc + + def __post_init__(self): + self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc @dataclass diff --git a/unstructured/ingest/connector/salesforce.py b/unstructured/ingest/connector/salesforce.py index 35c92a43b1..4db651708b 100644 --- a/unstructured/ingest/connector/salesforce.py +++ b/unstructured/ingest/connector/salesforce.py @@ -10,7 +10,7 @@ import os import typing as t from collections import OrderedDict -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime from email.utils import formatdate from pathlib import Path @@ -87,6 +87,13 @@ class SalesforceIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): record_type: str record_id: str registry_name: str = "salesforce" + _record: OrderedDict = field(default_factory=lambda: OrderedDict()) + + @property + def record(self): + if not self._record: + self._record = self.get_record() + return self._record def _tmp_download_file(self) -> Path: if self.record_type == "EmailMessage": @@ -102,7 +109,7 @@ def _tmp_download_file(self) -> Path: @property def _output_filename(self) -> Path: record_file = self.record_id + ".json" - return Path(self.partition_config.output_dir) / self.record_type / record_file + return Path(self.processor_config.output_dir) / self.record_type / 
record_file def _create_full_tmp_dir_path(self): self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) @@ -142,7 +149,23 @@ def _eml_for_record(self, email_json: t.Dict[str, t.Any]) -> str: ) return dedent(eml) - def update_source_metadata(self, record_json: t.Dict[str, t.Any]) -> None: # type: ignore + def get_record(self) -> OrderedDict: + client = self.connector_config.get_client() + + # Get record from Salesforce based on id + response = client.query_all( + f"select FIELDS(STANDARD) from {self.record_type} where Id='{self.record_id}'", + ) + logger.debug(f"response from salesforce record request: {response}") + records = response["records"] + if not records: + raise ValueError(f"No record found with record id {self.record_id}: {response}") + record_json = records[0] + return record_json + + def update_source_metadata(self) -> None: # type: ignore + record_json = self.record + date_format = "%Y-%m-%dT%H:%M:%S.000+0000" self.source_metadata = SourceMetadata( date_created=datetime.strptime(record_json["CreatedDate"], date_format).isoformat(), @@ -163,14 +186,9 @@ def get_file(self): self._create_full_tmp_dir_path() logger.debug(f"Writing file {self.record_id} - PID: {os.getpid()}") - client = self.connector_config.get_client() - - # Get record from Salesforce based on id - record = client.query_all( - f"select FIELDS(STANDARD) from {self.record_type} where Id='{self.record_id}'", - )["records"][0] + record = self.record - self.update_source_metadata(record) + self.update_source_metadata() try: if self.record_type == "EmailMessage": @@ -225,7 +243,7 @@ def get_ingest_docs(self) -> t.List[SalesforceIngestDoc]: ingest_docs.append( SalesforceIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, record_type=record_type, record_id=record["Id"], diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py 
index 9fdcf87c9e..e80bed98bd 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -5,16 +5,12 @@ from pathlib import Path from urllib.parse import urlparse -from unstructured.documents.elements import Element -from unstructured.embed.interfaces import BaseEmbeddingEncoder from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.ingest.error import SourceConnectionError from unstructured.ingest.interfaces import ( BaseConnectorConfig, BaseIngestDoc, BaseSourceConnector, - ChunkingConfig, - EmbeddingConfig, IngestDocCleanupMixin, SourceConnectorCleanupMixin, SourceMetadata, @@ -70,26 +66,6 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): is_page: bool file_path: str registry_name: str = "sharepoint" - embedding_config: t.Optional[EmbeddingConfig] = None - chunking_config: t.Optional[ChunkingConfig] = None - - def run_chunking(self, elements: t.List[Element]) -> t.List[Element]: - if self.chunking_config: - logger.info( - "Running chunking to split up elements with config: " - f"{self.chunking_config.to_dict()}", - ) - chunked_elements = self.chunking_config.chunk(elements=elements) - logger.info(f"chunked {len(elements)} elements into {len(chunked_elements)}") - return chunked_elements - else: - return elements - - @property - def embedder(self) -> t.Optional[BaseEmbeddingEncoder]: - if self.embedding_config and self.embedding_config.api_key: - return self.embedding_config.get_embedder() - return None def __post_init__(self): self.extension = Path(self.file_path).suffix if not self.is_page else ".html" @@ -107,7 +83,7 @@ def __post_init__(self): def _set_download_paths(self) -> None: """Parses the folder structure from the source and creates the download and output paths""" download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.partition_config.output_dir}") + output_path = Path(f"{self.processor_config.output_dir}") parent = 
Path(self.file_path).with_suffix(self.extension) self.download_dir = (download_path / parent.parent).resolve() self.download_filepath = (download_path / parent).resolve() @@ -258,8 +234,6 @@ def get_file(self): @dataclass class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): connector_config: SimpleSharepointConfig - embedding_config: t.Optional[EmbeddingConfig] = None - chunking_config: t.Optional[ChunkingConfig] = None @requires_dependencies(["office365"], extras="sharepoint") def _list_files(self, folder, recursive) -> t.List["File"]: @@ -291,15 +265,13 @@ def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_pag file_path = obj.serverRelativeUrl[1:] return SharepointIngestDoc( + processor_config=self.processor_config, read_config=self.read_config, - partition_config=self.partition_config, connector_config=self.connector_config, site_url=base_url, server_path=server_path, is_page=is_page, file_path=file_path, - embedding_config=self.embedding_config, - chunking_config=self.chunking_config, ) @requires_dependencies(["office365"], extras="sharepoint") diff --git a/unstructured/ingest/connector/slack.py b/unstructured/ingest/connector/slack.py index 77ac53809e..fa1aba03ce 100644 --- a/unstructured/ingest/connector/slack.py +++ b/unstructured/ingest/connector/slack.py @@ -31,7 +31,6 @@ class SimpleSlackConfig(BaseConnectorConfig): token: str oldest: t.Optional[str] latest: t.Optional[str] - verbose: bool = False def validate_inputs(self): oldest_valid = True @@ -79,7 +78,7 @@ def _tmp_download_file(self): @property def _output_filename(self): output_file = self.channel + ".json" - return Path(self.partition_config.output_dir) / output_file + return Path(self.processor_config.output_dir) / output_file @property def version(self) -> t.Optional[str]: @@ -150,7 +149,7 @@ def get_file(self): self._create_full_tmp_dir_path() - if self.connector_config.verbose: + if self.processor_config.verbose: logger.debug(f"fetching 
channel {self.channel} - PID: {os.getpid()}") result = self._fetch_messages() @@ -211,7 +210,7 @@ def get_ingest_docs(self): return [ SlackIngestDoc( connector_config=self.connector_config, - partition_config=self.partition_config, + processor_config=self.processor_config, read_config=self.read_config, channel=channel, token=self.connector_config.token, diff --git a/unstructured/ingest/connector/wikipedia.py b/unstructured/ingest/connector/wikipedia.py index d07415e7e8..3e20dfafd6 100644 --- a/unstructured/ingest/connector/wikipedia.py +++ b/unstructured/ingest/connector/wikipedia.py @@ -115,7 +115,7 @@ def text(self): @property def _output_filename(self): return ( - Path(self.partition_config.output_dir) + Path(self.processor_config.output_dir) / f"{self.page.title}-{self.page.revision_id}-html.json" ) @@ -137,7 +137,7 @@ def text(self): @property def _output_filename(self): return ( - Path(self.partition_config.output_dir) + Path(self.processor_config.output_dir) / f"{self.page.title}-{self.page.revision_id}-txt.json" ) @@ -160,7 +160,7 @@ def text(self): @property def _output_filename(self): return ( - Path(self.partition_config.output_dir) + Path(self.processor_config.output_dir) / f"{self.page.title}-{self.page.revision_id}-summary.json" ) @@ -175,18 +175,18 @@ def initialize(self): def get_ingest_docs(self): return [ WikipediaIngestTextDoc( + processor_config=self.processor_config, connector_config=self.connector_config, - partition_config=self.partition_config, read_config=self.read_config, ), WikipediaIngestHTMLDoc( + processor_config=self.processor_config, connector_config=self.connector_config, - partition_config=self.partition_config, read_config=self.read_config, ), WikipediaIngestSummaryDoc( + processor_config=self.processor_config, connector_config=self.connector_config, - partition_config=self.partition_config, read_config=self.read_config, ), ] diff --git a/unstructured/ingest/doc_processor/__init__.py b/unstructured/ingest/doc_processor/__init__.py 
deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/unstructured/ingest/doc_processor/generalized.py b/unstructured/ingest/doc_processor/generalized.py deleted file mode 100644 index f44b2fa8f4..0000000000 --- a/unstructured/ingest/doc_processor/generalized.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Process arbitrary files with the Unstructured library""" - -import os -from typing import Any, Dict, List, Optional - -from unstructured_inference.models.base import get_model - -from unstructured.ingest.connector.registry import create_ingest_doc_from_json -from unstructured.ingest.interfaces import ( - BaseSessionHandle, - IngestDocSessionHandleMixin, -) -from unstructured.ingest.logger import logger - -# module-level variable to store session handle -session_handle: Optional[BaseSessionHandle] = None - - -def initialize(): - """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment - variable (avoids subprocesses all doing the same)""" - - # If more than one model will be supported and left up to user selection - supported_model = os.environ.get("UNSTRUCTURED_HI_RES_SUPPORTED_MODEL", "") - if supported_model: - for model_name in supported_model.split(","): - get_model(model_name=model_name) - - get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")) - - -def process_document(ingest_doc_json: str, **partition_kwargs) -> Optional[List[Dict[str, Any]]]: - """Process the serialized json for any IngestDoc-like class of document with chosen - Unstructured partition logic. 
- - Parameters - ---------- - partition_kwargs - ultimately the parameters passed to partition() - """ - global session_handle - isd_elems_no_filename = None - doc = None - try: - doc = create_ingest_doc_from_json(ingest_doc_json) - if isinstance(doc, IngestDocSessionHandleMixin): - if session_handle is None: - # create via doc.session_handle, which is a property that creates a - # session handle if one is not already defined - session_handle = doc.session_handle - else: - doc.session_handle = session_handle - # does the work necessary to load file into filesystem - # in the future, get_file_handle() could also be supported - doc.get_file() - - isd_elems_no_filename = doc.process_file(**partition_kwargs) - - # Note, this may be a no-op if the IngestDoc doesn't do anything to persist - # the results. Instead, the Processor (caller) may work with the aggregate - # results across all docs in memory. - doc.write_result() - except Exception: - # TODO(crag) save the exception instead of print? - logger.error(f"Failed to process {doc}") - raise Exception - finally: - if doc: - doc.cleanup_file() - return isd_elems_no_filename diff --git a/unstructured/ingest/ingest_doc_json_mixin.py b/unstructured/ingest/ingest_doc_json_mixin.py new file mode 100644 index 0000000000..7802f232f7 --- /dev/null +++ b/unstructured/ingest/ingest_doc_json_mixin.py @@ -0,0 +1,60 @@ +import json +import typing as t +from pathlib import Path + +from dataclasses_json import DataClassJsonMixin +from dataclasses_json.core import _ExtendedEncoder + + +class IngestDocJsonMixin(DataClassJsonMixin): + """ + Inherently, DataClassJsonMixin does not add in any @property fields to the json/dict + created from the dataclass. This explicitly sets properties to look for on the IngestDoc + class when creating the json/dict for serialization purposes. 
+ """ + + properties_to_serialize = [ + "base_filename", + "date_created", + "date_modified", + "date_processed", + "exists", + "filename", + "_output_filename", + "record_locator", + "source_url", + "version", + ] + + def to_json( + self, + *, + skipkeys: bool = False, + ensure_ascii: bool = True, + check_circular: bool = True, + allow_nan: bool = True, + indent: t.Optional[t.Union[int, str]] = None, + separators: t.Optional[t.Tuple[str, str]] = None, + default: t.Optional[t.Callable] = None, + sort_keys: bool = False, + **kw, + ) -> str: + as_dict = self.to_dict(encode_json=False) + for prop in self.properties_to_serialize: + val = getattr(self, prop) + if isinstance(val, Path): + val = str(val) + as_dict[prop] = val + return json.dumps( + as_dict, + cls=_ExtendedEncoder, + skipkeys=skipkeys, + ensure_ascii=ensure_ascii, + check_circular=check_circular, + allow_nan=allow_nan, + indent=indent, + separators=separators, + default=default, + sort_keys=sort_keys, + **kw, + ) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index caefa50afd..6c219a6d0f 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -14,10 +14,11 @@ from dataclasses_json import DataClassJsonMixin from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import DataSourceMetadata, Element -from unstructured.embed.interfaces import BaseEmbeddingEncoder +from unstructured.documents.elements import DataSourceMetadata +from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.ingest.error import PartitionError, SourceConnectionError +from unstructured.ingest.ingest_doc_json_mixin import IngestDocJsonMixin from unstructured.ingest.logger import logger from unstructured.partition.auto import partition from unstructured.staging.base import convert_to_dict, elements_from_json @@ -36,12 +37,8 @@ class 
BaseConfig(DataClassJsonMixin, ABC): @dataclass class PartitionConfig(BaseConfig): # where to write structured data outputs - output_dir: str = "structured-output" - num_processes: int = 2 - max_docs: t.Optional[int] = None pdf_infer_table_structure: bool = False strategy: str = "auto" - reprocess: bool = False ocr_languages: str = "eng" encoding: t.Optional[str] = None fields_include: t.List[str] = field( @@ -55,6 +52,15 @@ class PartitionConfig(BaseConfig): api_key: t.Optional[str] = None +@dataclass +class ProcessorConfig(BaseConfig): + reprocess: bool = False + verbose: bool = False + work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve()) + output_dir: str = "structured-output" + num_processes: int = 2 + + @dataclass class ReadConfig(BaseConfig): # where raw documents are stored for processing, and then removed if not preserve_downloads @@ -62,6 +68,7 @@ class ReadConfig(BaseConfig): re_download: bool = False preserve_downloads: bool = False download_only: bool = False + max_docs: t.Optional[int] = None @dataclass @@ -117,7 +124,7 @@ class SourceMetadata(DataClassJsonMixin, ABC): @dataclass -class BaseIngestDoc(DataClassJsonMixin, ABC): +class BaseIngestDoc(IngestDocJsonMixin, ABC): """An "ingest document" is specific to a connector, and provides methods to fetch a single raw document, store it locally for processing, any cleanup needed after successful processing of the doc, and the ability to write the doc's @@ -126,34 +133,33 @@ class BaseIngestDoc(DataClassJsonMixin, ABC): Crucially, it is not responsible for the actual processing of the raw document. 
""" + processor_config: ProcessorConfig read_config: ReadConfig - partition_config: PartitionConfig connector_config: BaseConnectorConfig - source_metadata: t.Optional[SourceMetadata] = field(init=False, default=None) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._date_processed = None - - def run_chunking(self, elements: t.List[Element]) -> t.List[Element]: - return elements + _source_metadata: t.Optional[SourceMetadata] = field(init=False, default=None) + _date_processed: t.Optional[str] = field(init=False, default=None) @property - def embedder(self) -> t.Optional[BaseEmbeddingEncoder]: - return None + def source_metadata(self) -> SourceMetadata: + if self._source_metadata is None: + self.update_source_metadata() + # Provide guarantee that the field was set by update_source_metadata() + if self._source_metadata is None: + raise ValueError("failed to set source metadata") + return self._source_metadata + + @source_metadata.setter + def source_metadata(self, value: SourceMetadata): + self._source_metadata = value @property def date_created(self) -> t.Optional[str]: """The date the document was created on the source system.""" - if self.source_metadata is None: - self.update_source_metadata() return self.source_metadata.date_created # type: ignore @property def date_modified(self) -> t.Optional[str]: """The date the document was last modified on the source system.""" - if self.source_metadata is None: - self.update_source_metadata() return self.source_metadata.date_modified # type: ignore @property @@ -165,8 +171,6 @@ def date_processed(self) -> t.Optional[str]: @property def exists(self) -> t.Optional[bool]: """Whether the document exists on the remote source.""" - if self.source_metadata is None: - self.update_source_metadata() return self.source_metadata.exists # type: ignore @property @@ -174,6 +178,15 @@ def exists(self) -> t.Optional[bool]: def filename(self): """The local filename of the document after fetching from 
remote source.""" + @property + def base_filename(self) -> t.Optional[str]: + if self.read_config.download_dir and self.filename: + download_path = str(Path(self.read_config.download_dir).resolve()) + full_path = str(self.filename) + base_path = full_path.replace(download_path, "") + return base_path + return None + @property @abstractmethod def _output_filename(self): @@ -188,8 +201,6 @@ def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: # Values must be JS @property def source_url(self) -> t.Optional[str]: """The url of the source document.""" - if self.source_metadata is None: - self.update_source_metadata() return self.source_metadata.source_url # type: ignore @property @@ -197,8 +208,6 @@ def version(self) -> t.Optional[str]: """The version of the source document, this could be the last modified date, an explicit version number, or anything else that can be used to uniquely identify the version of the document.""" - if self.source_metadata is None: - self.update_source_metadata() return self.source_metadata.version # type: ignore @abstractmethod @@ -227,7 +236,7 @@ def wrapper(self, *args, **kwargs): # TODO: set as @abstractmethod and pass or raise NotImplementedError def update_source_metadata(self, **kwargs) -> None: """Sets the SourceMetadata and the properties for the doc""" - self.source_metadata = SourceMetadata() + self._source_metadata = SourceMetadata() # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods # in addition to or instead of get_file() @@ -241,18 +250,13 @@ def has_output(self) -> bool: """Determine if structured output for this doc already exists.""" return self._output_filename.is_file() and self._output_filename.stat().st_size - def write_result(self): - """Write the structured json result for this doc. 
result must be json serializable.""" - if self.read_config.download_only: - return - self._output_filename.parent.mkdir(parents=True, exist_ok=True) - with open(self._output_filename, "w", encoding="utf8") as output_f: - json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2) - logger.info(f"Wrote {self._output_filename}") - @PartitionError.wrap - def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]: - if not self.partition_config.partition_by_api: + def partition_file( + self, + partition_config: PartitionConfig, + **partition_kwargs, + ) -> t.List[Element]: + if not partition_config.partition_by_api: logger.debug("Using local partition") elements = partition( filename=str(self.filename), @@ -267,14 +271,14 @@ def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]: **partition_kwargs, ) else: - endpoint = self.partition_config.partition_endpoint + endpoint = partition_config.partition_endpoint logger.debug(f"Using remote partition ({endpoint})") with open(self.filename, "rb") as f: headers_dict = {} - if self.partition_config.api_key: - headers_dict["UNSTRUCTURED-API-KEY"] = self.partition_config.api_key + if partition_config.api_key: + headers_dict["UNSTRUCTURED-API-KEY"] = partition_config.api_key response = requests.post( f"{endpoint}", files={"files": (str(self.filename), f)}, @@ -286,30 +290,31 @@ def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]: if response.status_code != 200: raise RuntimeError(f"Caught {response.status_code} from API: {response.text}") elements = elements_from_json(text=json.dumps(response.json())) - elements = self.run_chunking(elements=elements) - if self.embedder: - logger.info("Running embedder to add vector content to elements") - elements = self.embedder.embed_documents(elements) - return convert_to_dict(elements) + return elements - def process_file(self, **partition_kwargs) -> t.Optional[t.List[t.Dict[str, t.Any]]]: + def process_file( + self, 
+ partition_config: PartitionConfig, + **partition_kwargs, + ) -> t.Optional[t.List[t.Dict[str, t.Any]]]: self._date_processed = datetime.utcnow().isoformat() if self.read_config.download_only: return None logger.info(f"Processing {self.filename}") - isd_elems = self.partition_file(**partition_kwargs) + isd_elems_raw = self.partition_file(partition_config=partition_config, **partition_kwargs) + isd_elems = convert_to_dict(isd_elems_raw) self.isd_elems_no_filename: t.List[t.Dict[str, t.Any]] = [] for elem in isd_elems: # type: ignore - if self.partition_config.metadata_exclude and self.partition_config.metadata_include: + if partition_config.metadata_exclude and partition_config.metadata_include: raise ValueError( "Arguments `--metadata-include` and `--metadata-exclude` are " "mutually exclusive with each other.", ) - elif self.partition_config.metadata_exclude: - ex_list = self.partition_config.metadata_exclude + elif partition_config.metadata_exclude: + ex_list = partition_config.metadata_exclude for ex in ex_list: if "." 
in ex: # handle nested fields nested_fields = ex.split(".") @@ -322,15 +327,15 @@ def process_file(self, **partition_kwargs) -> t.Optional[t.List[t.Dict[str, t.An current_elem.pop(field_to_exclude, None) else: # handle top-level fields elem["metadata"].pop(ex, None) # type: ignore[attr-defined] - elif self.partition_config.metadata_include: - in_list = self.partition_config.metadata_include + elif partition_config.metadata_include: + in_list = partition_config.metadata_include for k in list(elem["metadata"].keys()): # type: ignore[attr-defined] if k not in in_list: elem["metadata"].pop(k, None) # type: ignore[attr-defined] - in_list = self.partition_config.fields_include + in_list = partition_config.fields_include elem = {k: v for k, v in elem.items() if k in in_list} - if self.partition_config.flatten_metadata: + if partition_config.flatten_metadata: for k, v in elem["metadata"].items(): # type: ignore[attr-defined] elem[k] = v elem.pop("metadata") # type: ignore[attr-defined] @@ -344,21 +349,9 @@ def process_file(self, **partition_kwargs) -> t.Optional[t.List[t.Dict[str, t.An class BaseSourceConnector(DataClassJsonMixin, ABC): """Abstract Base Class for a connector to a remote source, e.g. 
S3 or Google Drive.""" + processor_config: ProcessorConfig read_config: ReadConfig connector_config: BaseConnectorConfig - partition_config: PartitionConfig - - def __init__( - self, - read_config: ReadConfig, - connector_config: BaseConnectorConfig, - partition_config: PartitionConfig, - ): - """Expects a standard_config object that implements StandardConnectorConfig - and config object that implements BaseConnectorConfig.""" - self.read_config = read_config - self.connector_config = connector_config - self.partition_config = partition_config @abstractmethod def cleanup(self, cur_dir=None): diff --git a/unstructured/ingest/pipeline/__init__.py b/unstructured/ingest/pipeline/__init__.py new file mode 100644 index 0000000000..19d78bdbcd --- /dev/null +++ b/unstructured/ingest/pipeline/__init__.py @@ -0,0 +1,20 @@ +from .doc_factory import DocFactory +from .interfaces import PipelineContext, ReformatNode +from .partition import Partitioner +from .pipeline import Pipeline +from .reformat.chunking import Chunker +from .reformat.embedding import Embedder +from .source import Reader +from .write import Writer + +__all__ = [ + "DocFactory", + "Partitioner", + "Reader", + "Embedder", + "PipelineContext", + "Pipeline", + "Writer", + "Chunker", + "ReformatNode", +] diff --git a/unstructured/ingest/pipeline/copy.py b/unstructured/ingest/pipeline/copy.py new file mode 100644 index 0000000000..446d126080 --- /dev/null +++ b/unstructured/ingest/pipeline/copy.py @@ -0,0 +1,19 @@ +import os +import shutil +from pathlib import Path + +from unstructured.ingest.connector.registry import create_ingest_doc_from_json +from unstructured.ingest.logger import logger +from unstructured.ingest.pipeline.interfaces import CopyNode + + +class Copier(CopyNode): + def run(self, json_path: str): + filename = os.path.basename(json_path) + doc_hash = os.path.splitext(filename)[0] + ingest_doc_json = self.pipeline_context.ingest_docs_map[doc_hash] + ingest_doc = 
create_ingest_doc_from_json(ingest_doc_json) + desired_output = ingest_doc._output_filename + Path(desired_output).parent.mkdir(parents=True, exist_ok=True) + logger.info(f"Copying {json_path} -> {desired_output}") + shutil.copy(json_path, desired_output) diff --git a/unstructured/ingest/pipeline/doc_factory.py b/unstructured/ingest/pipeline/doc_factory.py new file mode 100644 index 0000000000..e4a1598f8e --- /dev/null +++ b/unstructured/ingest/pipeline/doc_factory.py @@ -0,0 +1,15 @@ +import typing as t +from dataclasses import dataclass + +from unstructured.ingest.pipeline.interfaces import DocFactoryNode + + +@dataclass +class DocFactory(DocFactoryNode): + def initialize(self): + self.source_doc_connector.initialize() + + def run(self, *args, **kwargs) -> t.Iterable[str]: + docs = self.source_doc_connector.get_ingest_docs() + json_docs = [doc.to_json() for doc in docs] + return json_docs diff --git a/unstructured/ingest/pipeline/initialize.py b/unstructured/ingest/pipeline/initialize.py new file mode 100644 index 0000000000..eb006620ea --- /dev/null +++ b/unstructured/ingest/pipeline/initialize.py @@ -0,0 +1,16 @@ +import os + +from unstructured_inference.models.base import get_model + + +def initialize(): + """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment + variable (avoids subprocesses all doing the same)""" + + # If more than one model will be supported and left up to user selection + supported_model = os.environ.get("UNSTRUCTURED_HI_RES_SUPPORTED_MODEL", "") + if supported_model: + for model_name in supported_model.split(","): + get_model(model_name=model_name) + + get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")) diff --git a/unstructured/ingest/pipeline/interfaces.py b/unstructured/ingest/pipeline/interfaces.py new file mode 100644 index 0000000000..ef331b3247 --- /dev/null +++ b/unstructured/ingest/pipeline/interfaces.py @@ -0,0 +1,204 @@ +import hashlib +import json +import logging +import 
multiprocessing as mp +import typing as t +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from multiprocessing.managers import DictProxy +from pathlib import Path + +from dataclasses_json import DataClassJsonMixin + +from unstructured.ingest.interfaces import ( + BaseDestinationConnector, + BaseSourceConnector, + PartitionConfig, + ProcessorConfig, +) +from unstructured.ingest.logger import ingest_log_streaming_init, logger + + +@dataclass +class PipelineContext(ProcessorConfig): + """ + Data that gets shared across each pipeline node + """ + + def __post_init__(self): + self._ingest_docs_map: t.Optional[DictProxy] = None + + @property + def ingest_docs_map(self) -> DictProxy: + if self._ingest_docs_map is None: + raise ValueError("ingest_docs_map never initialized") + return self._ingest_docs_map + + @ingest_docs_map.setter + def ingest_docs_map(self, value: DictProxy): + self._ingest_docs_map = value + + +@dataclass +class PipelineNode(DataClassJsonMixin, ABC): + """ + Class that encapsulates logic to run during a single pipeline step + """ + + pipeline_context: PipelineContext + + def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any: + iterable = iterable if iterable else [] + self.initialize() + if not self.supported_multiprocessing(): + if iterable: + self.result = self.run(iterable) + else: + self.result = self.run() + elif self.pipeline_context.num_processes == 1: + if iterable: + self.result = [self.run(it) for it in iterable] + else: + self.result = self.run() + else: + with mp.Pool( + processes=self.pipeline_context.num_processes, + initializer=ingest_log_streaming_init, + initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,), + ) as pool: + self.result = pool.map(self.run, iterable) + return self.result + + def supported_multiprocessing(self) -> bool: + return True + + @abstractmethod + def run(self, *args, **kwargs) -> t.Optional[t.Any]: + pass + + def initialize(self): + if 
path := self.get_path(): + logger.info(f"Creating {path}") + path.mkdir(parents=True, exist_ok=True) + + def get_path(self) -> t.Optional[Path]: + return None + + +@dataclass +class DocFactoryNode(PipelineNode): + """ + Encapsulated logic to generate a list of ingest docs + """ + + source_doc_connector: BaseSourceConnector + + def initialize(self): + logger.info( + f"Running doc factory to generate ingest docs. " + f"Source connector: {self.source_doc_connector.to_json()}", + ) + super().initialize() + self.source_doc_connector.initialize() + + @abstractmethod + def run(self, *args, **kwargs) -> t.Iterable[str]: + pass + + def supported_multiprocessing(self) -> bool: + return False + + +@dataclass +class SourceNode(PipelineNode): + """ + Encapsulated logic to pull from a data source via base ingest docs + Output of logic expected to be the json outputs of the data itself + """ + + def initialize(self): + logger.info("Running source node to download data associated with ingest docs") + super().initialize() + + @abstractmethod + def run(self, ingest_doc_json: str) -> str: + pass + + +@dataclass +class PartitionNode(PipelineNode): + """ + Encapsulates logic to run partition on the json files as the output of the source node + """ + + partition_config: PartitionConfig + partition_kwargs: dict = field(default_factory=dict) + + def initialize(self): + logger.info( + f"Running partition node to extract content from json files. 
" + f"Config: {self.partition_config.to_json()}, " + f"partition kwargs: {json.dumps(self.partition_kwargs)}]", + ) + super().initialize() + + def create_hash(self) -> str: + hash_dict = self.partition_config.to_dict() + hash_dict["partition_kwargs"] = self.partition_kwargs + return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32] + + @abstractmethod + def run(self, json_path: str) -> str: + pass + + def get_path(self) -> Path: + return (Path(self.pipeline_context.work_dir) / "partitioned").resolve() + + +@dataclass +class ReformatNode(PipelineNode, ABC): + """ + Encapsulated any logic to reformat the output List[Element] + content from partition before writing it + """ + + pass + + +@dataclass +class WriteNode(PipelineNode): + """ + Encapsulated logic to write the final result to a downstream data connection + """ + + dest_doc_connector: BaseDestinationConnector + + @abstractmethod + def run(self, json_paths: t.List[str]): + pass + + def initialize(self): + logger.info( + f"Running write node to upload content. " + f"Destination connector: {self.dest_doc_connector.to_json()}]", + ) + super().initialize() + self.dest_doc_connector.initialize() + + def supported_multiprocessing(self) -> bool: + return False + + +@dataclass +class CopyNode(PipelineNode): + """ + Encapsulated logic to copy the final result of the pipeline to the designated output location. 
+ """ + + def initialize(self): + logger.info("Running copy node to move content to desired output location") + super().initialize() + + @abstractmethod + def run(self, json_path: str): + pass diff --git a/unstructured/ingest/pipeline/partition.py b/unstructured/ingest/pipeline/partition.py new file mode 100644 index 0000000000..b3f39ccb90 --- /dev/null +++ b/unstructured/ingest/pipeline/partition.py @@ -0,0 +1,43 @@ +import hashlib +import json +from dataclasses import dataclass +from pathlib import Path + +from unstructured.ingest.connector.registry import create_ingest_doc_from_json +from unstructured.ingest.error import PartitionError +from unstructured.ingest.logger import logger +from unstructured.ingest.pipeline.interfaces import PartitionNode +from unstructured.ingest.pipeline.utils import get_ingest_doc_hash + + +@dataclass +class Partitioner(PartitionNode): + @PartitionError.wrap + def run(self, ingest_doc_json) -> str: + doc = create_ingest_doc_from_json(ingest_doc_json) + doc_filename_hash = get_ingest_doc_hash(ingest_doc_json) + hashed_filename = hashlib.sha256( + f"{self.create_hash()}{doc_filename_hash}".encode(), + ).hexdigest()[:32] + self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json + doc_filename = f"{hashed_filename}.json" + json_path = (Path(self.get_path()) / doc_filename).resolve() + if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size: + logger.info(f"File exists: {json_path}, skipping partition") + return str(json_path) + languages = ( + self.partition_config.ocr_languages.split("+") + if self.partition_config.ocr_languages + else [] + ) + elements = doc.process_file( + partition_config=self.partition_config, + strategy=self.partition_config.strategy, + languages=languages, + encoding=self.partition_config.encoding, + pdf_infer_table_structure=self.partition_config.pdf_infer_table_structure, + ) + with open(json_path, "w", encoding="utf8") as output_f: + logger.info(f"writing 
partitioned content to {json_path}") + json.dump(elements, output_f, ensure_ascii=False, indent=2) + return str(json_path) diff --git a/unstructured/ingest/pipeline/pipeline.py b/unstructured/ingest/pipeline/pipeline.py new file mode 100644 index 0000000000..d7952efe08 --- /dev/null +++ b/unstructured/ingest/pipeline/pipeline.py @@ -0,0 +1,69 @@ +import logging +import multiprocessing as mp +import typing as t +from dataclasses import dataclass, field + +from dataclasses_json import DataClassJsonMixin + +from unstructured.ingest.logger import ingest_log_streaming_init, logger +from unstructured.ingest.pipeline.copy import Copier +from unstructured.ingest.pipeline.interfaces import ( + DocFactoryNode, + PartitionNode, + PipelineContext, + ReformatNode, + SourceNode, + WriteNode, +) +from unstructured.ingest.pipeline.utils import get_ingest_doc_hash + + +@dataclass +class Pipeline(DataClassJsonMixin): + pipeline_context: PipelineContext + doc_factory_node: DocFactoryNode + source_node: SourceNode + partition_node: PartitionNode + write_node: t.Optional[WriteNode] = None + reformat_nodes: t.List[ReformatNode] = field(default_factory=list) + + def initialize(self): + ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO) + + def get_nodes_str(self): + nodes = [self.doc_factory_node, self.source_node, self.partition_node] + nodes.extend(self.reformat_nodes) + if self.write_node: + nodes.append(self.write_node) + nodes.append(Copier(pipeline_context=self.pipeline_context)) + return " -> ".join([node.__class__.__name__ for node in nodes]) + + def run(self): + logger.info( + f"running pipeline: {self.get_nodes_str()} " + f"with config: {self.pipeline_context.to_json()}", + ) + self.initialize() + manager = mp.Manager() + self.pipeline_context.ingest_docs_map = manager.dict() + json_docs = self.doc_factory_node() + logger.info( + f"processing {len(json_docs)} docs via " + f"{self.pipeline_context.num_processes} processes", + ) + for doc 
in json_docs: + self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc + self.source_node(iterable=json_docs) + partitioned_jsons = self.partition_node(iterable=json_docs) + for reformat_node in self.reformat_nodes: + reformatted_jsons = reformat_node(iterable=partitioned_jsons) + partitioned_jsons = reformatted_jsons + + # Copy the final destination to the desired location + copier = Copier( + pipeline_context=self.pipeline_context, + ) + copier(iterable=partitioned_jsons) + + if self.write_node: + self.write_node(iterable=partitioned_jsons) diff --git a/unstructured/ingest/pipeline/reformat/chunking.py b/unstructured/ingest/pipeline/reformat/chunking.py new file mode 100644 index 0000000000..99acbff230 --- /dev/null +++ b/unstructured/ingest/pipeline/reformat/chunking.py @@ -0,0 +1,53 @@ +import hashlib +import json +import os.path +from dataclasses import dataclass +from pathlib import Path + +from unstructured.ingest.interfaces import ( + ChunkingConfig, +) +from unstructured.ingest.logger import logger +from unstructured.ingest.pipeline.interfaces import ReformatNode +from unstructured.staging.base import convert_to_dict, elements_from_json + + +@dataclass +class Chunker(ReformatNode): + chunking_config: ChunkingConfig + + def initialize(self): + logger.info( + f"Running chunking node. 
Chunking config: {self.chunking_config.to_json()}]", + ) + super().initialize() + + def create_hash(self) -> str: + hash_dict = self.chunking_config.to_dict() + return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32] + + def run(self, elements_json: str) -> str: + elements_json_filename = os.path.basename(elements_json) + filename_ext = os.path.basename(elements_json_filename) + filename = os.path.splitext(filename_ext)[0] + hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[ + :32 + ] + json_filename = f"{hashed_filename}.json" + json_path = (Path(self.get_path()) / json_filename).resolve() + self.pipeline_context.ingest_docs_map[ + hashed_filename + ] = self.pipeline_context.ingest_docs_map[filename] + if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size: + logger.debug(f"File exists: {json_path}, skipping embedding") + return str(json_path) + elements = elements_from_json(filename=elements_json) + chunked_elements = self.chunking_config.chunk(elements=elements) + elements_dict = convert_to_dict(chunked_elements) + with open(json_path, "w", encoding="utf8") as output_f: + logger.info(f"writing embeddings content to {json_path}") + json.dump(elements_dict, output_f, ensure_ascii=False, indent=2) + return str(json_path) + + def get_path(self) -> Path: + return (Path(self.pipeline_context.work_dir) / "chunked").resolve() diff --git a/unstructured/ingest/pipeline/reformat/embedding.py b/unstructured/ingest/pipeline/reformat/embedding.py new file mode 100644 index 0000000000..ae723b4797 --- /dev/null +++ b/unstructured/ingest/pipeline/reformat/embedding.py @@ -0,0 +1,51 @@ +import hashlib +import json +import os.path +from dataclasses import dataclass +from pathlib import Path + +from unstructured.ingest.interfaces import ( + EmbeddingConfig, +) +from unstructured.ingest.logger import logger +from unstructured.ingest.pipeline.interfaces import ReformatNode +from 
unstructured.staging.base import convert_to_dict, elements_from_json + + +@dataclass +class Embedder(ReformatNode): + embedder_config: EmbeddingConfig + + def initialize(self): + logger.info( + f"Running embedding node. Embedding config: {self.embedder_config.to_json()}]", + ) + super().initialize() + + def create_hash(self) -> str: + hash_dict = self.embedder_config.to_dict() + return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32] + + def run(self, elements_json: str) -> str: + elements_json_filename = os.path.basename(elements_json) + filename_ext = os.path.basename(elements_json_filename) + filename = os.path.splitext(filename_ext)[0] + hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[ + :32 + ] + json_filename = f"{hashed_filename}.json" + json_path = (Path(self.get_path()) / json_filename).resolve() + if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size: + logger.debug(f"File exists: {json_path}, skipping embedding") + return str(json_path) + elements = elements_from_json(filename=elements_json) + embedder = self.embedder_config.get_embedder() + embedded_elements = embedder.embed_documents(elements=elements) + elements_dict = convert_to_dict(embedded_elements) + with open(json_path, "w", encoding="utf8") as output_f: + logger.info(f"writing embeddings content to {json_path}") + json.dump(elements_dict, output_f, ensure_ascii=False, indent=2) + return str(json_path) + + def get_path(self) -> Path: + return (Path(self.pipeline_context.work_dir) / "embedded").resolve() diff --git a/unstructured/ingest/pipeline/source.py b/unstructured/ingest/pipeline/source.py new file mode 100644 index 0000000000..083cd22aaa --- /dev/null +++ b/unstructured/ingest/pipeline/source.py @@ -0,0 +1,27 @@ +import typing as t +from dataclasses import dataclass + +from unstructured.ingest.connector.registry import create_ingest_doc_from_json +from 
unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin +from unstructured.ingest.pipeline.interfaces import SourceNode + +# module-level variable to store session handle +session_handle: t.Optional[BaseSessionHandle] = None + + +@dataclass +class Reader(SourceNode): + def run(self, ingest_doc_json: str) -> str: + global session_handle + doc = create_ingest_doc_from_json(ingest_doc_json) + if isinstance(doc, IngestDocSessionHandleMixin): + if session_handle is None: + # create via doc.session_handle, which is a property that creates a + # session handle if one is not already defined + session_handle = doc.session_handle + else: + doc.session_handle = session_handle + # does the work necessary to load file into filesystem + # in the future, get_file_handle() could also be supported + doc.get_file() + return doc.filename diff --git a/unstructured/ingest/pipeline/utils.py b/unstructured/ingest/pipeline/utils.py new file mode 100644 index 0000000000..f172fb9f2c --- /dev/null +++ b/unstructured/ingest/pipeline/utils.py @@ -0,0 +1,8 @@ +import hashlib +import json + + +def get_ingest_doc_hash(doc: str) -> str: + json_as_dict = json.loads(doc) + hashed = hashlib.sha256(json_as_dict.get("filename").encode()).hexdigest()[:32] + return hashed diff --git a/unstructured/ingest/pipeline/write.py b/unstructured/ingest/pipeline/write.py new file mode 100644 index 0000000000..b9300ea3b6 --- /dev/null +++ b/unstructured/ingest/pipeline/write.py @@ -0,0 +1,18 @@ +import os.path +import typing as t +from dataclasses import dataclass + +from unstructured.ingest.connector.registry import create_ingest_doc_from_json +from unstructured.ingest.pipeline.interfaces import WriteNode + + +@dataclass +class Writer(WriteNode): + def run(self, json_paths: t.List[str]): + ingest_docs = [] + for json_path in json_paths: + filename = os.path.basename(json_path) + doc_hash = os.path.splitext(filename)[0] + ingest_doc_json = self.pipeline_context.ingest_docs_map[doc_hash] 
+ ingest_docs.append(create_ingest_doc_from_json(ingest_doc_json)) + self.dest_doc_connector.write(docs=ingest_docs) diff --git a/unstructured/ingest/processor.py b/unstructured/ingest/processor.py index a91fc671b5..8d9cd7c3e3 100644 --- a/unstructured/ingest/processor.py +++ b/unstructured/ingest/processor.py @@ -1,127 +1,75 @@ -import logging import multiprocessing as mp import typing as t from contextlib import suppress -from functools import partial -from unstructured.ingest.doc_processor.generalized import initialize, process_document from unstructured.ingest.interfaces import ( BaseDestinationConnector, BaseSourceConnector, + ChunkingConfig, + EmbeddingConfig, PartitionConfig, + ProcessorConfig, +) +from unstructured.ingest.pipeline import ( + Chunker, + DocFactory, + Embedder, + Partitioner, + Pipeline, + PipelineContext, + Reader, + ReformatNode, + Writer, ) -from unstructured.ingest.logger import ingest_log_streaming_init, logger with suppress(RuntimeError): mp.set_start_method("spawn") -class Processor: - def __init__( - self, - source_doc_connector: BaseSourceConnector, - doc_processor_fn, - num_processes: int, - reprocess: bool, - verbose: bool, - max_docs: t.Optional[int], - dest_doc_connector: t.Optional[BaseDestinationConnector] = None, - ): - # initialize the reader and writer - self.source_doc_connector = source_doc_connector - self.doc_processor_fn = doc_processor_fn - self.num_processes = num_processes - self.reprocess = reprocess - self.verbose = verbose - self.max_docs = max_docs - self.dest_doc_connector = dest_doc_connector - - def initialize(self): - """Slower initialization things: check connections, load things into memory, etc.""" - ingest_log_streaming_init(logging.DEBUG if self.verbose else logging.INFO) - self.source_doc_connector.initialize() - if self.dest_doc_connector: - self.dest_doc_connector.initialize() - initialize() - - def cleanup(self): - self.source_doc_connector.cleanup() - - def _filter_docs_with_outputs(self, docs): - 
num_docs_all = len(docs) - docs = [doc for doc in docs if not doc.has_output()] - if self.max_docs is not None: - if num_docs_all > self.max_docs: - num_docs_all = self.max_docs - docs = docs[: self.max_docs] - num_docs_to_process = len(docs) - if num_docs_to_process == 0: - logger.info( - "All docs have structured outputs, nothing to do. Use --reprocess to process all.", - ) - return None - elif num_docs_to_process != num_docs_all: - logger.info( - f"Skipping processing for {num_docs_all - num_docs_to_process} docs out of " - f"{num_docs_all} since their structured outputs already exist, use --reprocess to " - "reprocess those in addition to the unprocessed ones.", - ) - return docs - - def run_partition(self, docs): - if not self.reprocess: - docs = self._filter_docs_with_outputs(docs) - if not docs: - return - - # Debugging tip: use the below lines and comment out the mp.Pool loop - # block to remain in single process - # json_docs = [doc.to_json() for doc in docs] - # self.doc_processor_fn(json_docs[0]) - logger.info(f"Processing {len(docs)} docs") - json_docs = [doc.to_json() for doc in docs] - with mp.Pool( - processes=self.num_processes, - initializer=ingest_log_streaming_init, - initargs=(logging.DEBUG if self.verbose else logging.INFO,), - ) as pool: - pool.map(self.doc_processor_fn, json_docs) - - def run(self): - self.initialize() - - # fetch the list of lazy downloading IngestDoc obj's - docs = self.source_doc_connector.get_ingest_docs() - - try: - self.run_partition(docs=docs) - if self.dest_doc_connector: - self.dest_doc_connector.write(docs=docs) - finally: - self.cleanup() - - def process_documents( + processor_config: ProcessorConfig, source_doc_connector: BaseSourceConnector, partition_config: PartitionConfig, - verbose: bool, dest_doc_connector: t.Optional[BaseDestinationConnector] = None, + chunking_config: t.Optional[ChunkingConfig] = None, + embedder_config: t.Optional[EmbeddingConfig] = None, ) -> None: - languages = 
partition_config.ocr_languages.split("+") if partition_config.ocr_languages else [] - process_document_with_partition_args = partial( - process_document, - strategy=partition_config.strategy, - languages=languages, - encoding=partition_config.encoding, - pdf_infer_table_structure=partition_config.pdf_infer_table_structure, - ) - - Processor( + pipeline_config = PipelineContext.from_dict(processor_config.to_dict()) + doc_factory = DocFactory( + pipeline_context=pipeline_config, source_doc_connector=source_doc_connector, - doc_processor_fn=process_document_with_partition_args, - num_processes=partition_config.num_processes, - reprocess=partition_config.reprocess, - verbose=verbose, - max_docs=partition_config.max_docs, - dest_doc_connector=dest_doc_connector, - ).run() + ) + reader = Reader(pipeline_context=pipeline_config) + partitioner = Partitioner(pipeline_context=pipeline_config, partition_config=partition_config) + reformat_nodes: t.List[ReformatNode] = [] + if embedder_config: + reformat_nodes.append( + Embedder( + pipeline_context=pipeline_config, + embedder_config=embedder_config, + ), + ) + if chunking_config: + reformat_nodes.append( + Chunker( + pipeline_context=pipeline_config, + chunking_config=chunking_config, + ), + ) + writer = ( + Writer( + pipeline_context=pipeline_config, + dest_doc_connector=dest_doc_connector, + ) + if dest_doc_connector + else None + ) + pipeline = Pipeline( + pipeline_context=pipeline_config, + doc_factory_node=doc_factory, + source_node=reader, + partition_node=partitioner, + reformat_nodes=reformat_nodes, + write_node=writer, + ) + pipeline.run() diff --git a/unstructured/ingest/runner/__init__.py b/unstructured/ingest/runner/__init__.py index ef521f280c..f68a8dbdb9 100644 --- a/unstructured/ingest/runner/__init__.py +++ b/unstructured/ingest/runner/__init__.py @@ -1,85 +1,85 @@ import typing as t -from .airtable import airtable -from .azure import azure -from .biomed import biomed -from .box import box -from .confluence 
import confluence -from .delta_table import delta_table -from .discord import discord -from .dropbox import dropbox -from .elasticsearch import elasticsearch -from .fsspec import fsspec -from .gcs import gcs -from .github import github -from .gitlab import gitlab -from .google_drive import gdrive -from .jira import jira -from .local import local -from .notion import notion -from .onedrive import onedrive -from .outlook import outlook -from .reddit import reddit -from .s3 import s3 -from .salesforce import salesforce -from .sharepoint import SharePoint -from .slack import slack -from .wikipedia import wikipedia +from .airtable import AirtableRunner +from .azure import AzureRunner +from .biomed import BiomedRunner +from .box import BoxRunner +from .confluence import ConfluenceRunner +from .delta_table import DeltaTableRunner +from .discord import DiscordRunner +from .dropbox import DropboxRunner +from .elasticsearch import ElasticSearchRunner +from .fsspec import FsspecRunner +from .gcs import GCSRunner +from .github import GithubRunner +from .gitlab import GitlabRunner +from .google_drive import GoogleDriveRunner +from .jira import JiraRunner +from .local import LocalRunner +from .notion import NotionRunner +from .onedrive import OneDriveRunner +from .outlook import OutlookRunner +from .reddit import RedditRunner +from .s3 import S3Runner +from .salesforce import SalesforceRunner +from .sharepoint import SharePointRunner +from .slack import SlackRunner +from .wikipedia import WikipediaRunner runner_map: t.Dict[str, t.Callable] = { - "airtable": airtable, - "azure": azure, - "biomed": biomed, - "box": box, - "confluence": confluence, - "delta_table": delta_table, - "discord": discord, - "dropbox": dropbox, - "elasticsearch": elasticsearch, - "fsspec": fsspec, - "gcs": gcs, - "github": github, - "gitlab": gitlab, - "gdrive": gdrive, - "google_drive": gdrive, - "jira": jira, - "local": local, - "notion": notion, - "onedrive": onedrive, - "outlook": outlook, - "reddit": 
reddit, - "s3": s3, - "salesforce": salesforce, - "sharepoint": SharePoint, - "slack": slack, - "wikipedia": wikipedia, + "airtable": AirtableRunner, + "azure": AzureRunner, + "biomed": BiomedRunner, + "box": BoxRunner, + "confluence": ConfluenceRunner, + "delta_table": DeltaTableRunner, + "discord": DiscordRunner, + "dropbox": DropboxRunner, + "elasticsearch": ElasticSearchRunner, + "fsspec": FsspecRunner, + "gcs": GCSRunner, + "github": GithubRunner, + "gitlab": GitlabRunner, + "gdrive": GoogleDriveRunner, + "google_drive": GoogleDriveRunner, + "jira": JiraRunner, + "local": LocalRunner, + "notion": NotionRunner, + "onedrive": OneDriveRunner, + "outlook": OutlookRunner, + "reddit": RedditRunner, + "s3": S3Runner, + "salesforce": SalesforceRunner, + "sharepoint": SharePointRunner, + "slack": SlackRunner, + "wikipedia": WikipediaRunner, } __all__ = [ - "airtable", - "azure", - "biomed", - "box", - "confluence", - "delta_table", - "discord", - "dropbox", - "elasticsearch", - "fsspec", - "gcs", - "gdrive", - "github", - "gitlab", - "jira", - "local", - "notion", - "onedrive", - "outlook", - "reddit", - "s3", - "salesforce", - "SharePoint", - "slack", - "wikipedia", + "AirtableRunner", + "AzureRunner", + "BiomedRunner", + "BoxRunner", + "ConfluenceRunner", + "DeltaTableRunner", + "DiscordRunner", + "DropboxRunner", + "ElasticSearchRunner", + "FsspecRunner", + "GCSRunner", + "GoogleDriveRunner", + "GithubRunner", + "GitlabRunner", + "JiraRunner", + "LocalRunner", + "NotionRunner", + "OneDriveRunner", + "OutlookRunner", + "RedditRunner", + "S3Runner", + "SalesforceRunner", + "SharePointRunner", + "SlackRunner", + "WikipediaRunner", "runner_map", ] diff --git a/unstructured/ingest/runner/airtable.py b/unstructured/ingest/runner/airtable.py index 48fc109b6a..c018e1c117 100644 --- a/unstructured/ingest/runner/airtable.py +++ b/unstructured/ingest/runner/airtable.py @@ -2,59 +2,45 @@ import logging import typing as t -from unstructured.ingest.interfaces import 
PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map - - -def airtable( - read_config: ReadConfig, - partition_config: PartitionConfig, - personal_access_token: str, - verbose: bool = False, - list_of_paths: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - personal_access_token.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="airtable", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.airtable import ( - AirtableSourceConnector, - SimpleAirtableConfig, - ) - - source_doc_connector = AirtableSourceConnector( # type: ignore - connector_config=SimpleAirtableConfig( - personal_access_token=personal_access_token, - list_of_paths=list_of_paths, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) + + +class AirtableRunner(Runner): + def run( + self, + personal_access_token: str, + list_of_paths: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + personal_access_token.encode("utf-8"), + ) + + 
self.read_config.download_dir = update_download_dir_hash( + connector_name="airtable", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.airtable import ( + AirtableSourceConnector, + SimpleAirtableConfig, + ) + + source_doc_connector = AirtableSourceConnector( # type: ignore + processor_config=self.processor_config, + connector_config=SimpleAirtableConfig( + personal_access_token=personal_access_token, + list_of_paths=list_of_paths, + ), + read_config=self.read_config, + ) + + self.process_documents( + source_doc_connector=source_doc_connector, + ) diff --git a/unstructured/ingest/runner/azure.py b/unstructured/ingest/runner/azure.py index 58e2594b4d..c797cbc888 100644 --- a/unstructured/ingest/runner/azure.py +++ b/unstructured/ingest/runner/azure.py @@ -1,73 +1,59 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def azure( - read_config: ReadConfig, - partition_config: PartitionConfig, - account_name: t.Optional[str], - account_key: t.Optional[str], - connection_string: t.Optional[str], - remote_url: str, - verbose: bool = False, - recursive: bool = False, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - if not account_name and not connection_string: - raise ValueError( - "missing either account-name or connection-string", +class AzureRunner(Runner): + def run( + self, + account_name: t.Optional[str], + account_key: 
t.Optional[str], + connection_string: t.Optional[str], + remote_url: str, + recursive: bool = False, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + if not account_name and not connection_string: + raise ValueError( + "missing either account-name or connection-string", + ) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="azure", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, ) - read_config.download_dir = update_download_dir_remote_url( - connector_name="azure", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.azure import ( - AzureBlobStorageSourceConnector, - SimpleAzureBlobStorageConfig, - ) - - if account_name: - access_kwargs = { - "account_name": account_name, - "account_key": account_key, - } - elif connection_string: - access_kwargs = {"connection_string": connection_string} - else: - access_kwargs = {} - source_doc_connector = AzureBlobStorageSourceConnector( # type: ignore - connector_config=SimpleAzureBlobStorageConfig( - path=remote_url, - recursive=recursive, - access_kwargs=access_kwargs, - ), - read_config=read_config, - partition_config=partition_config, - ) + from unstructured.ingest.connector.azure import ( + AzureBlobStorageSourceConnector, + SimpleAzureBlobStorageConfig, + ) - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) + if account_name: + access_kwargs = { + "account_name": account_name, + "account_key": account_key, + } + elif connection_string: + access_kwargs = {"connection_string": connection_string} + else: + access_kwargs = {} + source_doc_connector = AzureBlobStorageSourceConnector( # type: ignore + processor_config=self.processor_config, + connector_config=SimpleAzureBlobStorageConfig( + path=remote_url, + recursive=recursive, + access_kwargs=access_kwargs, + ), + 
read_config=self.read_config, + ) - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) + self.process_documents( + source_doc_connector=source_doc_connector, + ) diff --git a/unstructured/ingest/runner/base_runner.py b/unstructured/ingest/runner/base_runner.py index c12bdce1e0..985e802232 100644 --- a/unstructured/ingest/runner/base_runner.py +++ b/unstructured/ingest/runner/base_runner.py @@ -3,18 +3,23 @@ from dataclasses import dataclass from unstructured.ingest.interfaces import ( + BaseDestinationConnector, + BaseSourceConnector, ChunkingConfig, EmbeddingConfig, PartitionConfig, + ProcessorConfig, ReadConfig, ) +from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.writers import writer_map @dataclass class Runner(ABC): + processor_config: ProcessorConfig read_config: ReadConfig partition_config: PartitionConfig - verbose: bool = False writer_type: t.Optional[str] = None writer_kwargs: t.Optional[dict] = None embedding_config: t.Optional[EmbeddingConfig] = None @@ -23,3 +28,20 @@ class Runner(ABC): @abstractmethod def run(self, *args, **kwargs): pass + + def get_dest_doc_connector(self) -> t.Optional[BaseDestinationConnector]: + writer_kwargs = self.writer_kwargs if self.writer_kwargs else {} + if self.writer_type: + writer = writer_map[self.writer_type] + return writer(**writer_kwargs) + return None + + def process_documents(self, source_doc_connector: BaseSourceConnector): + process_documents( + processor_config=self.processor_config, + source_doc_connector=source_doc_connector, + partition_config=self.partition_config, + dest_doc_connector=self.get_dest_doc_connector(), + embedder_config=self.embedding_config, + chunking_config=self.chunking_config, + ) diff --git a/unstructured/ingest/runner/biomed.py b/unstructured/ingest/runner/biomed.py index fe23aa34ca..2032709753 100644 --- 
a/unstructured/ingest/runner/biomed.py +++ b/unstructured/ingest/runner/biomed.py @@ -2,79 +2,62 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def biomed( - verbose: bool, - read_config: ReadConfig, - partition_config: PartitionConfig, - max_retries: int, - max_request_time: int, - decay: float, - path: t.Optional[str] = None, - api_id: t.Optional[str] = None, - api_from: t.Optional[str] = None, - api_until: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - base_path = ( - path - if path - else "{}-{}-{}".format( - api_id if api_id else "", - api_from if api_from else "", - api_until if api_until else "", +class BiomedRunner(Runner): + def run( + self, + max_retries: int, + max_request_time: int, + decay: float, + path: t.Optional[str] = None, + api_id: t.Optional[str] = None, + api_from: t.Optional[str] = None, + api_until: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + base_path = ( + path + if path + else "{}-{}-{}".format( + api_id if api_id else "", + api_from if api_from else "", + api_until if api_until else "", + ) ) - ) - - hashed_dir_name = hashlib.sha256( - base_path.encode("utf-8"), - ) - read_config.download_dir = update_download_dir_hash( - connector_name="biomed", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) + hashed_dir_name = hashlib.sha256( + 
base_path.encode("utf-8"), + ) - from unstructured.ingest.connector.biomed import ( - BiomedSourceConnector, - SimpleBiomedConfig, - ) + self.read_config.download_dir = update_download_dir_hash( + connector_name="biomed", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) - source_doc_connector = BiomedSourceConnector( # type: ignore - connector_config=SimpleBiomedConfig( - path=path, - id_=api_id, - from_=api_from, - until=api_until, - max_retries=max_retries, - request_timeout=max_request_time, - decay=decay, - ), - read_config=read_config, - partition_config=partition_config, - ) + from unstructured.ingest.connector.biomed import ( + BiomedSourceConnector, + SimpleBiomedConfig, + ) - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) + source_doc_connector = BiomedSourceConnector( # type: ignore + processor_config=self.processor_config, + connector_config=SimpleBiomedConfig( + path=path, + id_=api_id, + from_=api_from, + until=api_until, + max_retries=max_retries, + request_timeout=max_request_time, + decay=decay, + ), + read_config=self.read_config, + ) - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/box.py b/unstructured/ingest/runner/box.py index 20f066dfa3..b7a1eeb885 100644 --- a/unstructured/ingest/runner/box.py +++ b/unstructured/ingest/runner/box.py @@ -1,54 +1,38 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import 
update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def box( - read_config: ReadConfig, - partition_config: PartitionConfig, - remote_url: str, - verbose: bool = False, - recursive: bool = False, - box_app_config: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - read_config.download_dir = update_download_dir_remote_url( - connector_name="box", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.box import BoxSourceConnector, SimpleBoxConfig - - source_doc_connector = BoxSourceConnector( # type: ignore - read_config=read_config, - connector_config=SimpleBoxConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"box_app_config": box_app_config}, - ), - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class BoxRunner(Runner): + def run( + self, + remote_url: str, + recursive: bool = False, + box_app_config: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="box", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, + ) + + from unstructured.ingest.connector.box import BoxSourceConnector, SimpleBoxConfig + + source_doc_connector = BoxSourceConnector( # type: ignore + read_config=self.read_config, + connector_config=SimpleBoxConfig( + path=remote_url, + recursive=recursive, + 
access_kwargs={"box_app_config": box_app_config}, + ), + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/confluence.py b/unstructured/ingest/runner/confluence.py index 5192d07dc2..2cda95c6fb 100644 --- a/unstructured/ingest/runner/confluence.py +++ b/unstructured/ingest/runner/confluence.py @@ -2,69 +2,53 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def confluence( - read_config: ReadConfig, - partition_config: PartitionConfig, - url: str, - user_email: str, - api_token: str, - max_num_of_spaces: int, - max_num_of_docs_from_each_space: int, - verbose: bool = False, - spaces: t.Optional[t.List[str]] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - spaces = spaces if spaces else [] - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - url.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="confluence", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.confluence import ( - ConfluenceSourceConnector, - SimpleConfluenceConfig, - ) - - source_doc_connector = ConfluenceSourceConnector( # type: ignore - connector_config=SimpleConfluenceConfig( - url=url, - user_email=user_email, - api_token=api_token, - spaces=spaces, - max_number_of_spaces=max_num_of_spaces, - 
max_number_of_docs_from_each_space=max_num_of_docs_from_each_space, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class ConfluenceRunner(Runner): + def run( + self, + url: str, + user_email: str, + api_token: str, + max_num_of_spaces: int, + max_num_of_docs_from_each_space: int, + spaces: t.Optional[t.List[str]] = None, + **kwargs, + ): + spaces = spaces if spaces else [] + + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + url.encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="confluence", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.confluence import ( + ConfluenceSourceConnector, + SimpleConfluenceConfig, + ) + + source_doc_connector = ConfluenceSourceConnector( # type: ignore + processor_config=self.processor_config, + connector_config=SimpleConfluenceConfig( + url=url, + user_email=user_email, + api_token=api_token, + spaces=spaces, + max_number_of_spaces=max_num_of_spaces, + max_number_of_docs_from_each_space=max_num_of_docs_from_each_space, + ), + read_config=self.read_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/delta_table.py b/unstructured/ingest/runner/delta_table.py index f19a4d9c4a..888e9202f2 100644 --- a/unstructured/ingest/runner/delta_table.py +++ b/unstructured/ingest/runner/delta_table.py @@ -3,68 +3,50 @@ import typing as t from pathlib import Path -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from 
unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def delta_table( - read_config: ReadConfig, - partition_config: PartitionConfig, - table_uri: t.Union[str, Path], - version: t.Optional[int] = None, - storage_options: t.Optional[str] = None, - verbose: bool = False, - without_files: bool = False, - columns: t.Optional[t.List[str]] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - str(table_uri).encode("utf-8"), - ) - read_config.download_dir = update_download_dir_hash( - connector_name="delta_table", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.delta_table import ( - DeltaTableSourceConnector, - SimpleDeltaTableConfig, - ) - - source_doc_connector = DeltaTableSourceConnector( - connector_config=SimpleDeltaTableConfig( - verbose=verbose, - table_uri=table_uri, - version=version, - storage_options=SimpleDeltaTableConfig.storage_options_from_str(storage_options) - if storage_options - else None, - without_files=without_files, - columns=columns, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class DeltaTableRunner(Runner): + def run( + self, + table_uri: t.Union[str, Path], + version: 
t.Optional[int] = None, + storage_options: t.Optional[str] = None, + without_files: bool = False, + columns: t.Optional[t.List[str]] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + str(table_uri).encode("utf-8"), + ) + self.read_config.download_dir = update_download_dir_hash( + connector_name="delta_table", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.delta_table import ( + DeltaTableSourceConnector, + SimpleDeltaTableConfig, + ) + + source_doc_connector = DeltaTableSourceConnector( + connector_config=SimpleDeltaTableConfig( + table_uri=table_uri, + version=version, + storage_options=SimpleDeltaTableConfig.storage_options_from_str(storage_options) + if storage_options + else None, + without_files=without_files, + columns=columns, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/discord.py b/unstructured/ingest/runner/discord.py index de1a7d4cbb..ca26a868bf 100644 --- a/unstructured/ingest/runner/discord.py +++ b/unstructured/ingest/runner/discord.py @@ -2,63 +2,45 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def discord( - read_config: ReadConfig, - partition_config: PartitionConfig, - channels: t.List[str], - token: str, - verbose: bool = False, - period: t.Optional[int] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = 
None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - ",".join(channels).encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="discord", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.discord import ( - DiscordSourceConnector, - SimpleDiscordConfig, - ) - - source_doc_connector = DiscordSourceConnector( # type: ignore - connector_config=SimpleDiscordConfig( - channels=channels, - days=period, - token=token, - verbose=verbose, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class DiscordRunner(Runner): + def run( + self, + channels: t.List[str], + token: str, + period: t.Optional[int] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + ",".join(channels).encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="discord", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.discord import ( + DiscordSourceConnector, + SimpleDiscordConfig, + ) + + source_doc_connector = DiscordSourceConnector( # type: ignore + connector_config=SimpleDiscordConfig( + channels=channels, + days=period, + token=token, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git 
a/unstructured/ingest/runner/dropbox.py b/unstructured/ingest/runner/dropbox.py index e30ab36af3..7dcfa314e9 100644 --- a/unstructured/ingest/runner/dropbox.py +++ b/unstructured/ingest/runner/dropbox.py @@ -1,57 +1,41 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def dropbox( - read_config: ReadConfig, - partition_config: PartitionConfig, - remote_url: str, - verbose: bool = False, - recursive: bool = False, - token: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - read_config.download_dir = update_download_dir_remote_url( - connector_name="dropbox", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.dropbox import ( - DropboxSourceConnector, - SimpleDropboxConfig, - ) - - source_doc_connector = DropboxSourceConnector( # type: ignore - read_config=read_config, - connector_config=SimpleDropboxConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"token": token}, - ), - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class DropboxRunner(Runner): + def run( + self, + remote_url: str, + recursive: bool = False, + token: 
t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="dropbox", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, + ) + + from unstructured.ingest.connector.dropbox import ( + DropboxSourceConnector, + SimpleDropboxConfig, + ) + + source_doc_connector = DropboxSourceConnector( # type: ignore + read_config=self.read_config, + connector_config=SimpleDropboxConfig( + path=remote_url, + recursive=recursive, + access_kwargs={"token": token}, + ), + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/elasticsearch.py b/unstructured/ingest/runner/elasticsearch.py index 8c5a511576..1aaf65b938 100644 --- a/unstructured/ingest/runner/elasticsearch.py +++ b/unstructured/ingest/runner/elasticsearch.py @@ -2,64 +2,47 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def elasticsearch( - read_config: ReadConfig, - partition_config: PartitionConfig, - url: str, - index_name: str, - verbose: bool = False, - jq_query: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{index_name}".encode( - "utf-8", - ), - ) - - read_config.download_dir = update_download_dir_hash( - 
connector_name="elasticsearch", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchSourceConnector, - SimpleElasticsearchConfig, - ) - - source_doc_connector = ElasticsearchSourceConnector( # type: ignore - connector_config=SimpleElasticsearchConfig( - url=url, - index_name=index_name, - jq_query=jq_query, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class ElasticSearchRunner(Runner): + def run( + self, + url: str, + index_name: str, + jq_query: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + f"{url}_{index_name}".encode( + "utf-8", + ), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="elasticsearch", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.elasticsearch import ( + ElasticsearchSourceConnector, + SimpleElasticsearchConfig, + ) + + source_doc_connector = ElasticsearchSourceConnector( # type: ignore + connector_config=SimpleElasticsearchConfig( + url=url, + index_name=index_name, + jq_query=jq_query, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/fsspec.py b/unstructured/ingest/runner/fsspec.py index 7822b30140..0d4d6ea6b4 100644 --- a/unstructured/ingest/runner/fsspec.py +++ b/unstructured/ingest/runner/fsspec.py @@ -1,65 +1,48 @@ import logging 
-import typing as t import warnings from urllib.parse import urlparse -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def fsspec( - read_config: ReadConfig, - partition_config: PartitionConfig, - remote_url: str, - verbose: bool = False, - recursive: bool = False, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - read_config.download_dir = update_download_dir_remote_url( - connector_name="fsspec", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - protocol = urlparse(remote_url).scheme - warnings.warn( - f"`fsspec` protocol {protocol} is not directly supported by `unstructured`," - " so use it at your own risk. 
Supported protocols are `gcs`, `gs`, `s3`, `s3a`," - "`dropbox`, `abfs` and `az`.", - UserWarning, - ) - - from unstructured.ingest.connector.fsspec import ( - FsspecSourceConnector, - SimpleFsspecConfig, - ) - - source_doc_connector = FsspecSourceConnector( # type: ignore - connector_config=SimpleFsspecConfig( - path=remote_url, - recursive=recursive, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class FsspecRunner(Runner): + def run( + self, + remote_url: str, + recursive: bool = False, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="fsspec", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, + ) + + protocol = urlparse(remote_url).scheme + warnings.warn( + f"`fsspec` protocol {protocol} is not directly supported by `unstructured`," + " so use it at your own risk. 
Supported protocols are `gcs`, `gs`, `s3`, `s3a`," + "`dropbox`, `abfs` and `az`.", + UserWarning, + ) + + from unstructured.ingest.connector.fsspec import ( + FsspecSourceConnector, + SimpleFsspecConfig, + ) + + source_doc_connector = FsspecSourceConnector( # type: ignore + connector_config=SimpleFsspecConfig( + path=remote_url, + recursive=recursive, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/gcs.py b/unstructured/ingest/runner/gcs.py index a442a28916..d370f4f43d 100644 --- a/unstructured/ingest/runner/gcs.py +++ b/unstructured/ingest/runner/gcs.py @@ -1,54 +1,38 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def gcs( - read_config: ReadConfig, - partition_config: PartitionConfig, - remote_url: str, - verbose: bool = False, - recursive: bool = False, - token: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - read_config.download_dir = update_download_dir_remote_url( - connector_name="gcs", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.gcs import GcsSourceConnector, SimpleGcsConfig - - source_doc_connector = GcsSourceConnector( # type: ignore - connector_config=SimpleGcsConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"token": token}, - ), - 
read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class GCSRunner(Runner): + def run( + self, + remote_url: str, + recursive: bool = False, + token: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="gcs", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, + ) + + from unstructured.ingest.connector.gcs import GcsSourceConnector, SimpleGcsConfig + + source_doc_connector = GcsSourceConnector( # type: ignore + connector_config=SimpleGcsConfig( + path=remote_url, + recursive=recursive, + access_kwargs={"token": token}, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/github.py b/unstructured/ingest/runner/github.py index ff726da597..0b1e1835f7 100644 --- a/unstructured/ingest/runner/github.py +++ b/unstructured/ingest/runner/github.py @@ -2,66 +2,49 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def github( - read_config: ReadConfig, - partition_config: PartitionConfig, - url: str, - git_branch: str, - verbose: bool = False, - git_access_token: 
t.Optional[str] = None, - git_file_glob: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{git_branch}".encode( - "utf-8", - ), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="github", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.github import ( - GitHubSourceConnector, - SimpleGitHubConfig, - ) - - source_doc_connector = GitHubSourceConnector( # type: ignore - connector_config=SimpleGitHubConfig( - url=url, - access_token=git_access_token, - branch=git_branch, - file_glob=git_file_glob, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class GithubRunner(Runner): + def run( + self, + url: str, + git_branch: str, + git_access_token: t.Optional[str] = None, + git_file_glob: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + f"{url}_{git_branch}".encode( + "utf-8", + ), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="github", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.github import ( + GitHubSourceConnector, + SimpleGitHubConfig, + ) + + source_doc_connector = GitHubSourceConnector( # type: ignore + connector_config=SimpleGitHubConfig( + url=url, + 
access_token=git_access_token, + branch=git_branch, + file_glob=git_file_glob, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/gitlab.py b/unstructured/ingest/runner/gitlab.py index a4e6d9b947..dc03b39a79 100644 --- a/unstructured/ingest/runner/gitlab.py +++ b/unstructured/ingest/runner/gitlab.py @@ -2,66 +2,49 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def gitlab( - read_config: ReadConfig, - partition_config: PartitionConfig, - url: str, - git_branch: str, - verbose: bool = False, - git_access_token: t.Optional[str] = None, - git_file_glob: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{git_branch}".encode( - "utf-8", - ), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="gitlab", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.gitlab import ( - GitLabSourceConnector, - SimpleGitLabConfig, - ) - - source_doc_connector = GitLabSourceConnector( # type: ignore - connector_config=SimpleGitLabConfig( - url=url, - access_token=git_access_token, - branch=git_branch, - file_glob=git_file_glob, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - 
if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class GitlabRunner(Runner): + def run( + self, + url: str, + git_branch: str, + git_access_token: t.Optional[str] = None, + git_file_glob: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + f"{url}_{git_branch}".encode( + "utf-8", + ), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="gitlab", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.gitlab import ( + GitLabSourceConnector, + SimpleGitLabConfig, + ) + + source_doc_connector = GitLabSourceConnector( # type: ignore + connector_config=SimpleGitLabConfig( + url=url, + access_token=git_access_token, + branch=git_branch, + file_glob=git_file_glob, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/google_drive.py b/unstructured/ingest/runner/google_drive.py index 27ad5979bd..3635a9b3b5 100644 --- a/unstructured/ingest/runner/google_drive.py +++ b/unstructured/ingest/runner/google_drive.py @@ -2,64 +2,47 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def gdrive( - read_config: ReadConfig, - 
partition_config: PartitionConfig, - service_account_key: str, - drive_id: str, - verbose: bool = False, - recursive: bool = False, - extension: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - drive_id.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="gdrive", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.google_drive import ( - GoogleDriveSourceConnector, - SimpleGoogleDriveConfig, - ) - - source_doc_connector = GoogleDriveSourceConnector( # type: ignore - connector_config=SimpleGoogleDriveConfig( - drive_id=drive_id, - service_account_key=service_account_key, - recursive=recursive, - extension=extension, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class GoogleDriveRunner(Runner): + def run( + self, + service_account_key: str, + drive_id: str, + recursive: bool = False, + extension: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + drive_id.encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="google_drive", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.google_drive import ( + GoogleDriveSourceConnector, + SimpleGoogleDriveConfig, + ) 
+ + source_doc_connector = GoogleDriveSourceConnector( # type: ignore + connector_config=SimpleGoogleDriveConfig( + drive_id=drive_id, + service_account_key=service_account_key, + recursive=recursive, + extension=extension, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/jira.py b/unstructured/ingest/runner/jira.py index e9875e51ee..4c4e160ae2 100644 --- a/unstructured/ingest/runner/jira.py +++ b/unstructured/ingest/runner/jira.py @@ -2,71 +2,54 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def jira( - read_config: ReadConfig, - partition_config: PartitionConfig, - url: str, - user_email: str, - api_token: str, - verbose: bool = False, - projects: t.Optional[t.List[str]] = None, - boards: t.Optional[t.List[str]] = None, - issues: t.Optional[t.List[str]] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - projects = projects if projects else [] - boards = boards if boards else [] - issues = issues if issues else [] - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - url.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="jira", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.jira import ( - JiraSourceConnector, - SimpleJiraConfig, - ) - - source_doc_connector 
= JiraSourceConnector( # type: ignore - connector_config=SimpleJiraConfig( - url=url, - user_email=user_email, - api_token=api_token, - projects=projects, - boards=boards, - issues=issues, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class JiraRunner(Runner): + def run( + self, + url: str, + user_email: str, + api_token: str, + projects: t.Optional[t.List[str]] = None, + boards: t.Optional[t.List[str]] = None, + issues: t.Optional[t.List[str]] = None, + **kwargs, + ): + projects = projects if projects else [] + boards = boards if boards else [] + issues = issues if issues else [] + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + url.encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="jira", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.jira import ( + JiraSourceConnector, + SimpleJiraConfig, + ) + + source_doc_connector = JiraSourceConnector( # type: ignore + connector_config=SimpleJiraConfig( + url=url, + user_email=user_email, + api_token=api_token, + projects=projects, + boards=boards, + issues=issues, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/local.py b/unstructured/ingest/runner/local.py index a52ee598ec..c5400ba6b4 100644 --- a/unstructured/ingest/runner/local.py +++ b/unstructured/ingest/runner/local.py @@ -1,50 +1,33 @@ import logging import typing as t -from 
unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.writers import writer_map - - -def local( - read_config: ReadConfig, - partition_config: PartitionConfig, - input_path: str, - verbose: bool = False, - recursive: bool = False, - file_glob: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - from unstructured.ingest.connector.local import ( - LocalSourceConnector, - SimpleLocalConfig, - ) - - source_doc_connector = LocalSourceConnector( # type: ignore - connector_config=SimpleLocalConfig( - input_path=input_path, - recursive=recursive, - file_glob=file_glob, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +from unstructured.ingest.runner.base_runner import Runner + + +class LocalRunner(Runner): + def run( + self, + input_path: str, + recursive: bool = False, + file_glob: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + from unstructured.ingest.connector.local import ( + LocalSourceConnector, + SimpleLocalConfig, + ) + + source_doc_connector = LocalSourceConnector( # type: ignore + connector_config=SimpleLocalConfig( + input_path=input_path, + recursive=recursive, + file_glob=file_glob, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + 
self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/notion.py b/unstructured/ingest/runner/notion.py index 7aa22e9c4e..82b3a8095c 100644 --- a/unstructured/ingest/runner/notion.py +++ b/unstructured/ingest/runner/notion.py @@ -3,80 +3,63 @@ import typing as t from uuid import UUID -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def notion( - read_config: ReadConfig, - partition_config: PartitionConfig, - api_key: str, - verbose: bool = False, - recursive: bool = False, - page_ids: t.Optional[t.List[str]] = None, - database_ids: t.Optional[t.List[str]] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - page_ids = [str(UUID(p.strip())) for p in page_ids] if page_ids else [] - database_ids = [str(UUID(d.strip())) for d in database_ids] if database_ids else [] - writer_kwargs = writer_kwargs if writer_kwargs else {} +class NotionRunner(Runner): + def run( + self, + api_key: str, + recursive: bool = False, + page_ids: t.Optional[t.List[str]] = None, + database_ids: t.Optional[t.List[str]] = None, + **kwargs, + ): + page_ids = [str(UUID(p.strip())) for p in page_ids] if page_ids else [] + database_ids = [str(UUID(d.strip())) for d in database_ids] if database_ids else [] - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - if not page_ids and not database_ids: - raise ValueError("no page ids nor database ids provided") + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + if not page_ids and not database_ids: + raise ValueError("no page ids nor database ids 
provided") - if page_ids and database_ids: - hashed_dir_name = hashlib.sha256( - "{},{}".format(",".join(page_ids), ",".join(database_ids)).encode("utf-8"), - ) - elif page_ids: - hashed_dir_name = hashlib.sha256( - ",".join(page_ids).encode("utf-8"), - ) - elif database_ids: - hashed_dir_name = hashlib.sha256( - ",".join(database_ids).encode("utf-8"), - ) - else: - raise ValueError("could not create local cache directory name") - - read_config.download_dir = update_download_dir_hash( - connector_name="notion", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) + if page_ids and database_ids: + hashed_dir_name = hashlib.sha256( + "{},{}".format(",".join(page_ids), ",".join(database_ids)).encode("utf-8"), + ) + elif page_ids: + hashed_dir_name = hashlib.sha256( + ",".join(page_ids).encode("utf-8"), + ) + elif database_ids: + hashed_dir_name = hashlib.sha256( + ",".join(database_ids).encode("utf-8"), + ) + else: + raise ValueError("could not create local cache directory name") - from unstructured.ingest.connector.notion.connector import ( - NotionSourceConnector, - SimpleNotionConfig, - ) + self.read_config.download_dir = update_download_dir_hash( + connector_name="notion", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) - source_doc_connector = NotionSourceConnector( # type: ignore - connector_config=SimpleNotionConfig( - page_ids=page_ids, - database_ids=database_ids, - api_key=api_key, - verbose=verbose, - recursive=recursive, - ), - read_config=read_config, - partition_config=partition_config, - ) + from unstructured.ingest.connector.notion.connector import ( + NotionSourceConnector, + SimpleNotionConfig, + ) - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) + source_doc_connector = NotionSourceConnector( # type: ignore + connector_config=SimpleNotionConfig( + page_ids=page_ids, + database_ids=database_ids, + 
api_key=api_key, + recursive=recursive, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/onedrive.py b/unstructured/ingest/runner/onedrive.py index abf3d18938..f9d9e95b7f 100644 --- a/unstructured/ingest/runner/onedrive.py +++ b/unstructured/ingest/runner/onedrive.py @@ -2,70 +2,53 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def onedrive( - read_config: ReadConfig, - partition_config: PartitionConfig, - tenant: str, - user_pname: str, - client_id: str, - client_cred: str, - verbose: bool = False, - authority_url: t.Optional[str] = None, - path: t.Optional[str] = None, - recursive: bool = False, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{tenant}_{user_pname}".encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="onedrive", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.onedrive import ( - OneDriveSourceConnector, - SimpleOneDriveConfig, - ) - - source_doc_connector = OneDriveSourceConnector( # type: ignore - 
connector_config=SimpleOneDriveConfig( - client_id=client_id, - client_credential=client_cred, - user_pname=user_pname, - tenant=tenant, - authority_url=authority_url, - path=path, - recursive=recursive, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class OneDriveRunner(Runner): + def run( + self, + tenant: str, + user_pname: str, + client_id: str, + client_cred: str, + authority_url: t.Optional[str] = None, + path: t.Optional[str] = None, + recursive: bool = False, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + f"{tenant}_{user_pname}".encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="onedrive", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.onedrive import ( + OneDriveSourceConnector, + SimpleOneDriveConfig, + ) + + source_doc_connector = OneDriveSourceConnector( # type: ignore + connector_config=SimpleOneDriveConfig( + client_id=client_id, + client_credential=client_cred, + user_pname=user_pname, + tenant=tenant, + authority_url=authority_url, + path=path, + recursive=recursive, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/outlook.py b/unstructured/ingest/runner/outlook.py index d0613ce340..04dd724bc3 100644 --- a/unstructured/ingest/runner/outlook.py +++ b/unstructured/ingest/runner/outlook.py @@ -2,68 +2,52 @@ import logging import typing as t -from 
unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def outlook( - read_config: ReadConfig, - partition_config: PartitionConfig, - user_email: str, - verbose: bool = False, - recursive: bool = False, - client_id: t.Optional[str] = None, - client_cred: t.Optional[str] = None, - tenant: t.Optional[str] = None, - authority_url: t.Optional[str] = None, - outlook_folders: t.Optional[t.List[str]] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - outlook_folders = outlook_folders if outlook_folders else [] - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256(user_email.encode("utf-8")) - - read_config.download_dir = update_download_dir_hash( - connector_name="outlook", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.outlook import ( - OutlookSourceConnector, - SimpleOutlookConfig, - ) - - source_doc_connector = OutlookSourceConnector( # type: ignore - connector_config=SimpleOutlookConfig( - client_id=client_id, - client_credential=client_cred, - user_email=user_email, - tenant=tenant, - authority_url=authority_url, - ms_outlook_folders=outlook_folders, - recursive=recursive, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - 
dest_doc_connector=dest_doc_connector, - ) +class OutlookRunner(Runner): + def run( + self, + user_email: str, + recursive: bool = False, + client_id: t.Optional[str] = None, + client_cred: t.Optional[str] = None, + tenant: t.Optional[str] = None, + authority_url: t.Optional[str] = None, + outlook_folders: t.Optional[t.List[str]] = None, + **kwargs, + ): + outlook_folders = outlook_folders if outlook_folders else [] + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256(user_email.encode("utf-8")) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="outlook", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.outlook import ( + OutlookSourceConnector, + SimpleOutlookConfig, + ) + + source_doc_connector = OutlookSourceConnector( # type: ignore + connector_config=SimpleOutlookConfig( + client_id=client_id, + client_credential=client_cred, + user_email=user_email, + tenant=tenant, + authority_url=authority_url, + ms_outlook_folders=outlook_folders, + recursive=recursive, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/reddit.py b/unstructured/ingest/runner/reddit.py index fea56f1f12..f455eb2087 100644 --- a/unstructured/ingest/runner/reddit.py +++ b/unstructured/ingest/runner/reddit.py @@ -2,68 +2,51 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def reddit( - read_config: 
ReadConfig, - partition_config: PartitionConfig, - subreddit_name: str, - user_agent: str, - num_posts: int, - verbose: bool = False, - client_id: t.Optional[str] = None, - client_secret: t.Optional[str] = None, - search_query: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - subreddit_name.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="reddit", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.reddit import ( - RedditSourceConnector, - SimpleRedditConfig, - ) - - source_doc_connector = RedditSourceConnector( # type: ignore - connector_config=SimpleRedditConfig( - subreddit_name=subreddit_name, - client_id=client_id, - client_secret=client_secret, - user_agent=user_agent, - search_query=search_query, - num_posts=num_posts, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class RedditRunner(Runner): + def run( + self, + subreddit_name: str, + user_agent: str, + num_posts: int, + client_id: t.Optional[str] = None, + client_secret: t.Optional[str] = None, + search_query: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + subreddit_name.encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="reddit", + 
read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.reddit import ( + RedditSourceConnector, + SimpleRedditConfig, + ) + + source_doc_connector = RedditSourceConnector( # type: ignore + connector_config=SimpleRedditConfig( + subreddit_name=subreddit_name, + client_id=client_id, + client_secret=client_secret, + user_agent=user_agent, + search_query=search_query, + num_posts=num_posts, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/s3.py b/unstructured/ingest/runner/s3.py index e3646305fa..ad510de083 100644 --- a/unstructured/ingest/runner/s3.py +++ b/unstructured/ingest/runner/s3.py @@ -1,58 +1,44 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_remote_url -from unstructured.ingest.runner.writers import writer_map -def s3( - read_config: ReadConfig, - partition_config: PartitionConfig, - remote_url: str, - verbose: bool = False, - recursive: bool = False, - anonymous: bool = False, - endpoint_url: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - read_config.download_dir = update_download_dir_remote_url( - connector_name="s3", - read_config=read_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.s3 import S3SourceConnector, SimpleS3Config - - access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} 
- if endpoint_url: - access_kwargs["endpoint_url"] = endpoint_url - source_doc_connector = S3SourceConnector( # type: ignore - connector_config=SimpleS3Config( - path=remote_url, - recursive=recursive, - access_kwargs=access_kwargs, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class S3Runner(Runner): + def run( + self, + remote_url: str, + recursive: bool = False, + anonymous: bool = False, + endpoint_url: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + self.read_config.download_dir = update_download_dir_remote_url( + connector_name="s3", + read_config=self.read_config, + remote_url=remote_url, + logger=logger, + ) + + from unstructured.ingest.connector.s3 import S3SourceConnector, SimpleS3Config + + access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} + if endpoint_url: + access_kwargs["endpoint_url"] = endpoint_url + source_doc_connector = S3SourceConnector( # type: ignore + connector_config=SimpleS3Config( + path=remote_url, + recursive=recursive, + access_kwargs=access_kwargs, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents( + source_doc_connector=source_doc_connector, + ) diff --git a/unstructured/ingest/runner/salesforce.py b/unstructured/ingest/runner/salesforce.py index 415d9be79b..16ba029c62 100644 --- a/unstructured/ingest/runner/salesforce.py +++ b/unstructured/ingest/runner/salesforce.py @@ -2,63 +2,47 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, 
logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def salesforce( - read_config: ReadConfig, - partition_config: PartitionConfig, - username: str, - consumer_key: str, - private_key_path: str, - categories: t.List[str], - verbose: bool = False, - recursive: bool = False, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256(username.encode("utf-8")) - - read_config.download_dir = update_download_dir_hash( - connector_name="salesforce", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.salesforce import ( - SalesforceSourceConnector, - SimpleSalesforceConfig, - ) - - source_doc_connector = SalesforceSourceConnector( # type: ignore - connector_config=SimpleSalesforceConfig( - categories=categories, - username=username, - consumer_key=consumer_key, - private_key_path=private_key_path, - recursive=recursive, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class SalesforceRunner(Runner): + def run( + self, + username: str, + consumer_key: str, + private_key_path: str, + categories: t.List[str], + recursive: bool = False, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = 
hashlib.sha256(username.encode("utf-8")) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="salesforce", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.salesforce import ( + SalesforceSourceConnector, + SimpleSalesforceConfig, + ) + + source_doc_connector = SalesforceSourceConnector( # type: ignore + connector_config=SimpleSalesforceConfig( + categories=categories, + username=username, + consumer_key=consumer_key, + private_key_path=private_key_path, + recursive=recursive, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index 1781e2cbd9..744abcb1a1 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -2,13 +2,11 @@ import logging from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -class SharePoint(Runner): +class SharePointRunner(Runner): def run( self, site: str, @@ -19,9 +17,7 @@ def run( recursive: bool = False, **kwargs, ): - writer_kwargs = self.writer_kwargs if self.writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if self.verbose else logging.INFO) + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) hashed_dir_name = hashlib.sha256( f"{site}_{path}".encode("utf-8"), @@ -40,6 +36,7 @@ def run( ) source_doc_connector = SharepointSourceConnector( # type: ignore + processor_config=self.processor_config, connector_config=SimpleSharepointConfig( client_id=client_id, 
client_credential=client_cred, @@ -49,19 +46,8 @@ def run( recursive=recursive, ), read_config=self.read_config, - partition_config=self.partition_config, - embedding_config=self.embedding_config, - chunking_config=self.chunking_config, ) - dest_doc_connector = None - if self.writer_type: - writer = writer_map[self.writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( + self.process_documents( source_doc_connector=source_doc_connector, - partition_config=self.partition_config, - verbose=self.verbose, - dest_doc_connector=dest_doc_connector, ) diff --git a/unstructured/ingest/runner/slack.py b/unstructured/ingest/runner/slack.py index 0b9919c216..7cc5757932 100644 --- a/unstructured/ingest/runner/slack.py +++ b/unstructured/ingest/runner/slack.py @@ -2,65 +2,47 @@ import logging import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map -def slack( - read_config: ReadConfig, - partition_config: PartitionConfig, - channels: t.List[str], - token: str, - verbose: bool = False, - start_date: t.Optional[str] = None, - end_date: t.Optional[str] = None, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - ",".join(channels).encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="slack", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.slack import ( - SimpleSlackConfig, - 
SlackSourceConnector, - ) - - source_doc_connector = SlackSourceConnector( # type: ignore - connector_config=SimpleSlackConfig( - channels=channels, - token=token, - oldest=start_date, - latest=end_date, - verbose=verbose, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) +class SlackRunner(Runner): + def run( + self, + channels: t.List[str], + token: str, + start_date: t.Optional[str] = None, + end_date: t.Optional[str] = None, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + ",".join(channels).encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="slack", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from unstructured.ingest.connector.slack import ( + SimpleSlackConfig, + SlackSourceConnector, + ) + + source_doc_connector = SlackSourceConnector( # type: ignore + connector_config=SimpleSlackConfig( + channels=channels, + token=token, + oldest=start_date, + latest=end_date, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/wikipedia.py b/unstructured/ingest/runner/wikipedia.py index 8914042cad..bb83409538 100644 --- a/unstructured/ingest/runner/wikipedia.py +++ b/unstructured/ingest/runner/wikipedia.py @@ -1,61 +1,43 @@ import hashlib import logging -import typing as t -from unstructured.ingest.interfaces import PartitionConfig, ReadConfig from unstructured.ingest.logger import ingest_log_streaming_init, logger -from 
unstructured.ingest.processor import process_documents +from unstructured.ingest.runner.base_runner import Runner from unstructured.ingest.runner.utils import update_download_dir_hash -from unstructured.ingest.runner.writers import writer_map - - -def wikipedia( - read_config: ReadConfig, - partition_config: PartitionConfig, - page_title: str, - verbose: bool = False, - auto_suggest: bool = False, - writer_type: t.Optional[str] = None, - writer_kwargs: t.Optional[dict] = None, - **kwargs, -): - writer_kwargs = writer_kwargs if writer_kwargs else {} - - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - page_title.encode("utf-8"), - ) - - read_config.download_dir = update_download_dir_hash( - connector_name="wikipedia", - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.wikipedia import ( - SimpleWikipediaConfig, - WikipediaSourceConnector, - ) - - source_doc_connector = WikipediaSourceConnector( # type: ignore - connector_config=SimpleWikipediaConfig( - title=page_title, - auto_suggest=auto_suggest, - ), - read_config=read_config, - partition_config=partition_config, - ) - - dest_doc_connector = None - if writer_type: - writer = writer_map[writer_type] - dest_doc_connector = writer(**writer_kwargs) - - process_documents( - source_doc_connector=source_doc_connector, - partition_config=partition_config, - verbose=verbose, - dest_doc_connector=dest_doc_connector, - ) + + +class WikipediaRunner(Runner): + def run( + self, + page_title: str, + auto_suggest: bool = False, + **kwargs, + ): + ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) + + hashed_dir_name = hashlib.sha256( + page_title.encode("utf-8"), + ) + + self.read_config.download_dir = update_download_dir_hash( + connector_name="wikipedia", + read_config=self.read_config, + hashed_dir_name=hashed_dir_name, + logger=logger, + ) + + from 
unstructured.ingest.connector.wikipedia import ( + SimpleWikipediaConfig, + WikipediaSourceConnector, + ) + + source_doc_connector = WikipediaSourceConnector( # type: ignore + connector_config=SimpleWikipediaConfig( + title=page_title, + auto_suggest=auto_suggest, + ), + read_config=self.read_config, + processor_config=self.processor_config, + ) + + self.process_documents(source_doc_connector=source_doc_connector) diff --git a/unstructured/ingest/runner/writers.py b/unstructured/ingest/runner/writers.py index 7be5073c0f..b47a806b2f 100644 --- a/unstructured/ingest/runner/writers.py +++ b/unstructured/ingest/runner/writers.py @@ -60,7 +60,6 @@ def delta_table_writer( table_uri: t.Union[str, Path], write_column: str, mode: t.Literal["error", "append", "overwrite", "ignore"] = "error", - verbose: bool = False, **kwargs, ): from unstructured.ingest.connector.delta_table import ( @@ -73,7 +72,6 @@ def delta_table_writer( write_config=DeltaTableWriteConfig(write_column=write_column, mode=mode), connector_config=SimpleDeltaTableConfig( table_uri=table_uri, - verbose=verbose, ), )