diff --git a/.gitignore b/.gitignore index f6b3580..6273790 100644 --- a/.gitignore +++ b/.gitignore @@ -116,7 +116,7 @@ env.bak/ venv.bak/ # secrets file in TOML format -secrets.toml +# secrets.toml # Spyder project settings .spyderproject diff --git a/Dockerfile b/Dockerfile index 5852e3f..2849e41 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,11 @@ RUN apt-get update RUN apt-get install -y vim RUN apt install -y unixodbc +RUN apt-get install -y python3-pip +RUN pip3 install --upgrade pip +RUN pip install psycopg2-binary + + RUN wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.0g-2ubuntu4_amd64.deb RUN dpkg -i libssl1.1_1.1.0g-2ubuntu4_amd64.deb diff --git a/README.md b/README.md index 33587f4..fdb674f 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ For our example we would index the content of https://www.paulgraham.com website * On Linux, ensure that the `run.sh` file is executable by running the following command: ```bash - cmhod +x run.sh + chmod +x run.sh ``` * On Windows, ensure that you run this command from within the WSL 2 environment. diff --git a/config/e-commerce.yaml b/config/e-commerce.yaml new file mode 100644 index 0000000..46df23e --- /dev/null +++ b/config/e-commerce.yaml @@ -0,0 +1,18 @@ +vectara: + corpus_id: 2 + customer_id: 2252938633 + reindex: true + +crawling: + crawler_type: website + +website_crawler: + root_domain: https://www.ozon.ru + urls: + [ + 'https://www.ozon.ru/search/?from_global=true&sorting=new&text=%D1%82%D0%BE%D0%B2%D0%B0%D1%80%D1%8B+%D0%B4%D0%BB%D1%8F+%D0%B4%D0%B5%D0%B2%D0%BE%D1%87%D0%B5%D0%BA' + ] + max_depth: 6 + delay: 1 + pages_source: crawl + extraction: playwright diff --git a/config/jscamp.yaml b/config/jscamp.yaml new file mode 100644 index 0000000..7c2a355 --- /dev/null +++ b/config/jscamp.yaml @@ -0,0 +1,16 @@ +vectara: + corpus_id: 4 + customer_id: 2252938633 + reindex: false + +crawling: + crawler_type: docs + +docs_crawler: + base_urls: ['https://www.jscamp.app'] + pos_regex: ['https://www.jscamp.app/docs.*'] + docs_repo: 'https://github.com/gHashTag/jscamp' + docs_homepage: 'https://www.jscamp.app/docs' + docs_system: 'docusaurus' + neg_regex: [] + extensions_to_ignore: ['md', 'rst', 'ipynb', 'txt'] diff --git a/config/supabase.yaml b/config/supabase.yaml new file mode 100644 index 0000000..4558904 --- /dev/null +++ b/config/supabase.yaml @@ -0,0 +1,14 @@ +vectara: + corpus_id: 6 + customer_id: 2252938633 + reindex: true + +crawling: + crawler_type: database + +database_crawler: + db_url: 'postgresql+psycopg2://postgres.xwnkrqjxeyyfposbfpez:zizdu3-dukhud-xyMxeg@aws-0-eu-central-1.pooler.supabase.com:5432/postgres' + db_table: ecommerce + doc_id_columns: [product] + text_columns: [product] + metadata_columns: [category, author, usd, rub, url, image_url] diff --git a/e-commerce-dev-vectara.pem b/e-commerce-dev-vectara.pem new file mode 100644 index 0000000..5231027 --- /dev/null +++ b/e-commerce-dev-vectara.pem @@ -0,0 +1,27 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIEpQIBAAKCAQEAtbgB5Re0yLVb2qxUkgtDk25xQEnnO6ydTKt4B2WQ6Bntlypl +9DnmYmMBi3DuI4ISFrNXW5GT2vJEtiN9K+IKbUi5pF7V4fJphO506vR6RLqSLNGB +2QxCzdTS6OkOdPBbZ7H9SxturFODhZZBIPZfSn7ZQQj3+m4/5ZCwmFEmYfXK3aV+ +tO8FC4gMT3Suv5l+pbl0PRdUPX+StK5tlNWmOaQ53ow0IlGp9K5xcEcZZ/Awf7+A +IfhDQAtHk42q6q0H9eLxQ4ECNJ/gwANlDgAAlpNHwo9rh8JrUTDV+2mWG/mUMch+ ++fRJJLPcxkNdVJADrJVo1oKOO8g635Cae+0bNwIDAQABAoIBAQCnaUtNnKZv1i02 +a1dKaNgrxi7N1IgUQPV1we7FmbuDKQfrISNmecUW1TGCHls2GUnC4hxT/u0hY9sL +lexmdku23K4ykTAoheTSk8DhCLRwBxZN9gHTTWlqKWzZKISqFRmQkbjR7gbq+dGM +S53a2T926SALu1CBxfpV6xPWb8SVr4YpOOdrTDGkasP0Pyl/k1H/a7h9Z3hliCdA +s/IF4IAA90nwtDIV555sxeANgIckAoja2JlAz9/JVAyMCLPJDofoXdq1M1NoHk/3 +C6pkeu/sBDMf1Mtrw5e12fqXb9YcqU8V6wlx+Jxz+2V03fD5pixiwzS+RQSzfNlf +Y1sRzPfRAoGBAOnxirweUmkztD/lUtG+8Up5e9z+d8gA+VhXAFxmVXkXgxl9uTq/ +WPNAGPkFEisA2uWJ32+uTcjBECr+YrzPiQjEq/gYPiNrccrHBsaF8cPlxMZcHVCH +g9M2bdR88f1y2Ihoz8mUeyrckeCd/EnucjEaHF7QQky5cmpqJXUXIemZAoGBAMbZ ++Dwv6VhRbfXCTq5yvWw1yCalXzb5R4kwSQ/5V13XO3udd/eXt/0B8jNSXKuYNIrY ++BVl76opWZ0l/BU0Tcc4WPPbIyb8w+6lFR7Z2ToArxlYeI/ctonIuiAWmPNFqi+9 +T5kOVyH308r2pGhq2QT+q+eTkVv14uOVnCZ8q01PAoGBAMQF8pvCRIeGHWjUvNqn +RgODW7hHORLLTiMpc1H5JXF7tAwt/oEtSFk6wjIWf6ys4L4pEqG/ycmHjAaadR2E +QGyqbvZJSfff1c91iUXbgkMFSiZzlNSfWGQoXUsJDrOyAaH7QPDFi5bUjx9JAdbp +B6IBIPiJRKSJV0e3fX9pTK8xAoGBAKPdPzip6srC37Bj3N7Y4svnTfa61EnDBnZA ++hRVzu8fzH7Ddvbraczu653rQz6MgM/imB1aETPSO2zKvn7iyS1ge70IWmFobjWc +iFkSAHZR1fk29LLomOvWI7sJpCTkxaftg0iHjroJNdjNgxIg6pG5JAQDkQeNCtms +ki8NEUT1AoGAd0NU2o6pOnZcs/1SQOnd7X7BKgnkTtzkL2gvzzQ1NDAw24VsQsZi +OiueiXidblA8vFeJJQR2FXkq0qClaUUknRoMWGIy66sWvGNbBJY/BBud5xeBMYqD +poy4LuXsAnLIqabGG0mnkU9EkIF8Nzvz23aO80bI2HshInQXuvyIzE8= +-----END RSA PRIVATE KEY----- \ No newline at end of file diff --git a/run.sh b/run.sh old mode 100644 new mode 100755 diff --git a/secrets.example.toml b/secrets.example.toml deleted file mode 100644 index b26dbf6..0000000 --- a/secrets.example.toml +++ /dev/null @@ -1,2 +0,0 @@ -[default] -api_key="...vectara-api-key..."