diff --git a/create_es.sh b/create_es.sh index 44510f42b..42aaa7a48 100755 --- a/create_es.sh +++ b/create_es.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -e # create indices named "nyaa" and "sukebei", these are hardcoded curl -v -XPUT 'localhost:9200/nyaa?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml diff --git a/es_mapping.yml b/es_mapping.yml index 14983d522..28462f6bb 100644 --- a/es_mapping.yml +++ b/es_mapping.yml @@ -10,7 +10,6 @@ settings: char_filter: - my_char_filter filter: - - standard - lowercase my_index_analyzer: type: custom @@ -52,7 +51,7 @@ settings: filter: my_ngram: - type: edgeNGram + type: edge_ngram min_gram: 1 max_gram: 15 fullword_min: @@ -66,9 +65,13 @@ settings: type: pattern_capture patterns: ["0*([0-9]*)"] word_delimit: - type: word_delimiter + type: word_delimiter_graph preserve_original: true split_on_numerics: false + # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms + # since we're using "trim" filters downstream, otherwise + # you get weird lucene errors about startOffset + adjust_offsets: false char_filter: my_char_filter: type: mapping @@ -78,70 +81,65 @@ settings: # plus replicas don't really help either. number_of_shards: 1 number_of_replicas : 0 - mapper: - # disable elasticsearch's "helpful" autoschema - dynamic: false - # since we disabled the _all field, default query the - # name of the torrent. query: default_field: display_name mappings: - torrent: - # don't want everything concatenated - _all: - enabled: false - properties: - id: - type: long - display_name: - # TODO could do a fancier tokenizer here to parse out the - # the scene convention of stuff in brackets, plus stuff like k-on - type: text - analyzer: my_index_analyzer - fielddata: true # Is this required? 
- fields: - # Multi-field for full-word matching (when going over ngram limits) - # Note: will have to be queried for, not automatic - fullword: - type: text - analyzer: my_fullword_index_analyzer - # Stored for exact phrase matching - exact: - type: text - analyzer: exact_analyzer - created_time: - type: date - # Only in the ES index for generating magnet links - info_hash: - enabled: false - filesize: - type: long - anonymous: - type: boolean - trusted: - type: boolean - remake: - type: boolean - complete: - type: boolean - hidden: - type: boolean - deleted: - type: boolean - has_torrent: - type: boolean - download_count: - type: long - leech_count: - type: long - seed_count: - type: long - comment_count: - type: long - # these ids are really only for filtering, thus keyword - uploader_id: - type: keyword - main_category_id: - type: keyword - sub_category_id: - type: keyword \ No newline at end of file + # disable elasticsearch's "helpful" autoschema + dynamic: false + properties: + id: + type: long + display_name: + # TODO could do a fancier tokenizer here to parse out + # the scene convention of stuff in brackets, plus stuff like k-on + type: text + analyzer: my_index_analyzer + fielddata: true # Is this required? 
+ fields: + # Multi-field for full-word matching (when going over ngram limits) + # Note: will have to be queried for, not automatic + fullword: + type: text + analyzer: my_fullword_index_analyzer + # Stored for exact phrase matching + exact: + type: text + analyzer: exact_analyzer + created_time: + type: date + # + # Only in the ES index for generating magnet links + info_hash: + type: keyword + index: false + filesize: + type: long + anonymous: + type: boolean + trusted: + type: boolean + remake: + type: boolean + complete: + type: boolean + hidden: + type: boolean + deleted: + type: boolean + has_torrent: + type: boolean + download_count: + type: long + leech_count: + type: long + seed_count: + type: long + comment_count: + type: long + # these ids are really only for filtering, thus keyword + uploader_id: + type: keyword + main_category_id: + type: keyword + sub_category_id: + type: keyword diff --git a/import_to_es.py b/import_to_es.py index c244abb53..671710087 100755 --- a/import_to_es.py +++ b/import_to_es.py @@ -34,7 +34,6 @@ def pad_bytes(in_bytes, size): def mk_es(t, index_name): return { "_id": t.id, - "_type": "torrent", "_index": index_name, "_source": { # we're also indexing the id as a number so you can diff --git a/nyaa/templates/search_results.html b/nyaa/templates/search_results.html index 25b7142df..76ac1318b 100644 --- a/nyaa/templates/search_results.html +++ b/nyaa/templates/search_results.html @@ -17,7 +17,7 @@ {% endif %} {% endif %} -{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %} +{% if (use_elastic and torrent_query.hits.total.value > 0) or (torrent_query.items) %}
diff --git a/nyaa/views/main.py b/nyaa/views/main.py index 8dfe38f88..a1dae5ac3 100644 --- a/nyaa/views/main.py +++ b/nyaa/views/main.py @@ -167,7 +167,7 @@ def home(rss): else: rss_query_string = _generate_query_string( search_term, category, quality_filter, user_name) - max_results = min(max_search_results, query_results['hits']['total']) + max_results = min(max_search_results, query_results['hits']['total']['value']) # change p= argument to whatever you change page_parameter to or pagination breaks pagination = Pagination(p=query_args['page'], per_page=results_per_page, total=max_results, bs_version=3, page_parameter='p', diff --git a/sync_es.py b/sync_es.py index 382c74481..aa1adcbd9 100755 --- a/sync_es.py +++ b/sync_es.py @@ -114,7 +114,6 @@ def reindex_torrent(t, index_name): return { '_op_type': 'update', '_index': index_name, - '_type': 'torrent', '_id': str(t['id']), "doc": doc, "doc_as_upsert": True @@ -128,7 +127,6 @@ def reindex_stats(s, index_name): return { '_op_type': 'update', '_index': index_name, - '_type': 'torrent', '_id': str(s['torrent_id']), "doc": { "stats_last_updated": s["last_updated"], @@ -141,7 +139,6 @@ def delet_this(row, index_name): return { "_op_type": 'delete', '_index': index_name, - '_type': 'torrent', '_id': str(row['values']['id'])} # we could try to make this script robust to errors from es or mysql, but since