diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..9c6fb0d2 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +patreon: medcl +custom: ["https://www.buymeacoffee.com/medcl"] diff --git a/README.md b/README.md index 617e0804..f1406443 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,9 @@ Versions IK version | ES version -----------|----------- -master | 6.x -> master -6.3.0| 6.3.0 -6.2.4| 6.2.4 -6.1.3| 6.1.3 -5.6.8| 5.6.8 -5.5.3| 5.5.3 -5.4.3| 5.4.3 -5.3.3| 5.3.3 -5.2.2| 5.2.2 -5.1.2| 5.1.2 +master | 7.x -> master +6.x| 6.x +5.x| 5.x 1.10.6 | 2.4.6 1.9.5 | 2.3.5 1.8.1 | 2.2.1 @@ -64,13 +57,13 @@ curl -XPUT http://localhost:9200/index 2.create a mapping ```bash -curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d' { "properties": { "content": { "type": "text", "analyzer": "ik_max_word", - "search_analyzer": "ik_max_word" + "search_analyzer": "ik_smart" } } @@ -80,25 +73,25 @@ curl -XPOST http://localhost:9200/index/fulltext/_mapping -H 'Content-Type:appli 3.index some docs ```bash -curl -XPOST http://localhost:9200/index/fulltext/1 -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d' {"content":"美国留给伊拉克的是个烂摊子吗"} ' ``` ```bash -curl -XPOST http://localhost:9200/index/fulltext/2 -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d' {"content":"公安部:各地校车将享最高路权"} ' ``` ```bash -curl -XPOST http://localhost:9200/index/fulltext/3 -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d' {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"} ' ``` ```bash -curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d' {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"} ' ``` @@ -106,7 +99,7 @@ curl -XPOST http://localhost:9200/index/fulltext/4 -H 'Content-Type:application/ 4.query with highlighting ```bash -curl -XPOST http://localhost:9200/index/fulltext/_search -H 'Content-Type:application/json' -d' +curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d' { "query" : { "match" : { "content" : "中国" }}, "highlight" : { @@ -248,13 +241,13 @@ curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: applica 4. ik_max_word 和 ik_smart 什么区别? -ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合; +ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query; -ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。 +ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询。 Changes ------ -*5.0.0* +*自 v5.0.0 起* - 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word` diff --git a/config/IKAnalyzer.cfg.xml b/config/IKAnalyzer.cfg.xml index fe69bb20..0c39dac6 100644 --- a/config/IKAnalyzer.cfg.xml +++ b/config/IKAnalyzer.cfg.xml @@ -10,4 +10,12 @@ + + + + root + + root + + 10 diff --git a/mysql.extend.md b/mysql.extend.md new file mode 100644 index 00000000..2d6b3688 --- /dev/null +++ b/mysql.extend.md @@ -0,0 +1,75 @@ +IK Analysis 扩展词新增mysql同步来源 +============================= + +- 支持启动全量加载扩展词 +- 支持热更新扩展词 + +> mysql 扩展词表结构 + + +```mysql + +CREATE TABLE `es_lexicon` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '词库id', + `create_date` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', + `modify_date` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '修改时间', + `lexicon_text` varchar(40) NOT NULL COMMENT '词条关键词', + `lexicon_type` tinyint(1) NOT NULL DEFAULT '0' COMMENT '0扩展词库 1停用词库', + `lexicon_status` tinyint(1) NOT NULL DEFAULT '0' COMMENT '词条状态 0正常 1暂停使用', + `del_flag` tinyint(1) NOT NULL DEFAULT '0' COMMENT '作废标志 0正常 1作废', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='ES远程扩展词库表' +``` + + + +```IKAnalyzer.cfg.xml``` + + +```xml + + + + IK Analyzer 扩展配置 + + + + + + + + + + + + root + + 123456 + + 10 + + +``` + +> 自行打包 放入elasticsearch plugins 目录即可 + +启动日志如下: + +``` +[2021-06-02T15:16:07,593][INFO ][o.w.a.d.Dictionary ] ======start mysql to reload ik dict.====== +[2021-06-02T15:16:07,828][INFO ][o.w.a.d.Dictionary ] last update mysql ext dic time :2021-05-27T14:36:05.000+0800,fill count:4843 ,disable count:0 +[2021-06-02T15:16:07,837][INFO ][o.w.a.d.Dictionary ] the last reload stop word not found, the last update time :null +[2021-06-02T15:16:07,838][INFO ][o.w.a.d.Dictionary ] last update mysql stop word time :null,fill count:0 ,disable count:0 +[2021-06-02T15:16:07,838][INFO ][o.w.a.d.Dictionary ] ======reload mysql ik dict finished.====== +[2021-06-02T15:16:17,587][INFO ][o.w.a.d.Dictionary ] ======start mysql to reload ik dict.====== +[2021-06-02T15:16:17,615][INFO ][o.w.a.d.Dictionary ] last update mysql ext dic time :2021-06-01T09:44:50.000+0800,fill count:4842 ,disable count:0 +[2021-06-02T15:16:17,623][INFO ][o.w.a.d.Dictionary ] the last reload stop word not found, the last update time :null +[2021-06-02T15:16:17,624][INFO ][o.w.a.d.Dictionary ] last update mysql stop word time :null,fill count:0 ,disable count:0 +[2021-06-02T15:16:17,624][INFO ][o.w.a.d.Dictionary ] ======reload mysql ik dict finished.====== +[2021-06-02T15:16:27,596][INFO ][o.w.a.d.Dictionary ] ======start mysql to reload ik dict.====== +[2021-06-02T15:16:27,602][INFO ][o.w.a.d.Dictionary ] the latest update record was not found, the last update time :2021-06-01T09:44:50.000+0800 +[2021-06-02T15:16:27,602][INFO ][o.w.a.d.Dictionary ] last update mysql ext dic time :2021-06-01T09:44:50.000+0800,fill count:0 ,disable count:0 +[2021-06-02T15:16:27,608][INFO ][o.w.a.d.Dictionary ] the last reload stop word not found, the last update time :null +[2021-06-02T15:16:27,608][INFO ][o.w.a.d.Dictionary ] last update mysql stop word time :null,fill count:0 ,disable count:0 +[2021-06-02T15:16:27,608][INFO ][o.w.a.d.Dictionary ] ======reload mysql ik dict finished.====== +``` \ No newline at end of file diff --git a/pom.xml b/pom.xml index 32fb4e14..7ab39feb 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2011 - 6.5.0 + 6.2.3 1.8 ${project.basedir}/src/main/assemblies/plugin.xml analysis-ik @@ -95,6 +95,11 @@ log4j-api 2.3 + + mysql + mysql-connector-java + 5.1.49 + org.hamcrest diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index 8b6fa594..9bc850c8 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -39,6 +39,7 @@ true org.apache.httpcomponents:httpclient + mysql:mysql-connector-java diff --git a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java index 17d09806..51773ddd 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java @@ -11,7 +11,7 @@ public class IkTokenizerFactory extends AbstractTokenizerFactory { private Configuration configuration; public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { - super(indexSettings, name, settings); + super(indexSettings, name,settings); configuration=new Configuration(env,settings); } diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index dadd0f20..78e9f5d4 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -24,6 +24,9 @@ public class Configuration { //是否启用远程词典加载 private boolean enableRemoteDict=false; + //是否启用远程词典加载 + private boolean enableMysqlDict=false; + //是否启用小写处理 private boolean enableLowercase=true; @@ -36,6 +39,7 @@ public Configuration(Environment env,Settings settings) { this.useSmart = settings.get("use_smart", "false").equals("true"); this.enableLowercase = settings.get("enable_lowercase", "true").equals("true"); this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true"); + this.enableMysqlDict = settings.get("enable_mysql_dict", "true").equals("true"); Dictionary.initial(this); @@ -69,6 +73,10 @@ public boolean isEnableRemoteDict() { return enableRemoteDict; } + public boolean isEnableMysqlDict() { + return enableMysqlDict; + } + public boolean isEnableLowercase() { return enableLowercase; } diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java index 5bf1ac90..890d9080 100644 --- a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java +++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java @@ -268,13 +268,13 @@ void outputToResult(){ while(l != null){ this.results.add(l); //字典中无单字,但是词元冲突了,切分出相交词元的前一个词元中的单字 - int innerIndex = index + 1; + /*int innerIndex = index + 1; for (; innerIndex < index + l.getLength(); innerIndex++) { Lexeme innerL = path.peekFirst(); if (innerL != null && innerIndex == innerL.getBegin()) { this.outputSingleCJK(innerIndex - 1); } - } + }*/ //将index移至lexeme后 index = l.getBegin() + l.getLength(); diff --git a/src/main/java/org/wltea/analyzer/db/DBConfigProperties.java b/src/main/java/org/wltea/analyzer/db/DBConfigProperties.java new file mode 100644 index 00000000..eabb6296 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/DBConfigProperties.java @@ -0,0 +1,58 @@ +package org.wltea.analyzer.db; + +import java.io.Serializable; + +/** + * @author fsren + * @date 2021-05-25 + */ +public class DBConfigProperties implements Serializable { + + private static final long serialVersionUID = 688310733642302993L; + private String dbUrl; + private String user; + private String password; + private Integer reloadInterval; + + public String getDbUrl() { + return dbUrl; + } + + public void setDbUrl(String dbUrl) { + this.dbUrl = dbUrl; + } + + public String getUser() { + return user; + } + + public void setUser(String user) { + this.user = user; + } + + public String getPassword() { + return password; + } + + public void setPassword(String password) { + this.password = password; + } + + public Integer getReloadInterval() { + return reloadInterval; + } + + public void setReloadInterval(Integer reloadInterval) { + this.reloadInterval = reloadInterval; + } + + @Override + public String toString() { + return "DBConfigProperties{" + + "dbUrl='" + dbUrl + '\'' + + ", user='" + user + '\'' + + ", password='" + password + '\'' + + ", reloadInterval=" + reloadInterval + + '}'; + } +} diff --git a/src/main/java/org/wltea/analyzer/db/DataSourceFactory.java b/src/main/java/org/wltea/analyzer/db/DataSourceFactory.java new file mode 100644 index 00000000..65f9d032 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/DataSourceFactory.java @@ -0,0 +1,43 @@ +package org.wltea.analyzer.db; + +import com.mysql.jdbc.jdbc2.optional.MysqlConnectionPoolDataSource; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.SpecialPermission; +import org.wltea.analyzer.help.ESPluginLoggerFactory; + +import javax.sql.DataSource; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.sql.SQLException; + +/** + * @author fsren + * @date 2021-05-25 + */ +public class DataSourceFactory { + + + private static final Logger logger = ESPluginLoggerFactory.getLogger(DataSourceFactory.class.getName()); + + + public static DataSource getDataSource(DBConfigProperties configProperties) { + + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + logger.info("load datasource start"); + MysqlConnectionPoolDataSource dataSource = new MysqlConnectionPoolDataSource(); + dataSource.setURL(configProperties.getDbUrl()); + dataSource.setUser(configProperties.getUser()); + dataSource.setPassword(configProperties.getPassword()); + dataSource.setAllowMultiQueries(true); + try { + dataSource.setSocketTimeout(1000); + } catch (SQLException ignore) { + } + logger.info("load datasource end"); + return dataSource; + }); + } + + +} diff --git a/src/main/java/org/wltea/analyzer/db/JdbcUtil.java b/src/main/java/org/wltea/analyzer/db/JdbcUtil.java new file mode 100644 index 00000000..7cea9676 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/JdbcUtil.java @@ -0,0 +1,92 @@ +package org.wltea.analyzer.db; + +import org.wltea.analyzer.db.core.*; + +import javax.sql.DataSource; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; + +/** + * @author fsren + * @date 2021-05-26 + */ +public class JdbcUtil { + + private final ConnectionManager connManager; + + + /** + * 创建JdbcUtils + * + * @param dataSource 数据源 + */ + public JdbcUtil(DataSource dataSource) { + connManager = new ConnectionManager(dataSource); + } + + /** + * 创建语句 + * + * @param conn 连接 + * @param sql sql语句 + * @param params sql参数 + * @return 创建的PreparedStatement对象 + * @throws SQLException 来自JDBC的异常 + */ + private PreparedStatement createPreparedStatement(Connection conn, String sql, Object... params) throws SQLException { + PreparedStatement stmt = conn.prepareStatement(sql); + for (int i = 0; i < params.length; ++i) { + stmt.setObject(i + 1, params[i]); + } + return stmt; + } + + /** + * 查询数据库并转换结果集。 + * 用户可自定义结果集转换器。 + * 用户也可使用预定义的结果集转换器。 + * + * @param sql sql语句 + * @param recordMapper 结果集转换器 + * @param params sql参数 + * @param resultSetMapper返回的结果类型 + * @return 成功则返回转换结果,失败则抛出DbException,结果为空则返回空列表 + * @see RecordMapper + * @see ListRecordMapper + */ + public T query(String sql, RecordMapper recordMapper, Object... params) { + ResultSet rs = null; + PreparedStatement stmt = null; + Connection conn = null; + try { + conn = connManager.getConnection(); + stmt = createPreparedStatement(conn, sql, params); + rs = stmt.executeQuery(); + return recordMapper.map(new RecordAdapterForResultSet(rs)); + } catch (Exception e) { + throw new RuntimeException(e.getMessage(), e); + } finally { + connManager.close(conn, stmt, rs); + } + } + + /** + * 查询数据库,对结果集的每一行进行转换,然后将所有行封装成列表。 + * 用户可自定义行转换器。 + * 用户也可使用预定义的行转换器。 + * + * @param sql sql语句 + * @param rowMapper 行转换器 + * @param params sql参数 + * @param rowMapper返回的结果类型 + * @return 成功则返回结果列表,失败则抛出DbException,结果为空则返回空列表 + * @see RowMapper + * @see MapRowMapper + */ + public List queryList(String sql, RowMapper rowMapper, Object... params) { + return query(sql, new ListRecordMapper<>(rowMapper), params); + } +} diff --git a/src/main/java/org/wltea/analyzer/db/Lexicon.java b/src/main/java/org/wltea/analyzer/db/Lexicon.java new file mode 100644 index 00000000..6b0233a6 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/Lexicon.java @@ -0,0 +1,51 @@ +package org.wltea.analyzer.db; + +import java.io.Serializable; +import java.sql.Timestamp; + +/** + * @author fsren + * @date 2021-05-26 + */ +public class Lexicon implements Serializable { + + private static final long serialVersionUID = 7628519160135272308L; + /** + * 扩展词 + */ + private String lexiconText; + + /** + * 加载状态 true 代表加载, false 代表屏蔽 + */ + private Boolean isFill; + + /** + * 最后更新时间 + */ + private Timestamp modifyDate; + + public String getLexiconText() { + return lexiconText; + } + + public void setLexiconText(String lexiconText) { + this.lexiconText = lexiconText; + } + + public Boolean getFill() { + return isFill; + } + + public void setFill(Boolean fill) { + isFill = fill; + } + + public Timestamp getModifyDate() { + return modifyDate; + } + + public void setModifyDate(Timestamp modifyDate) { + this.modifyDate = modifyDate; + } +} diff --git a/src/main/java/org/wltea/analyzer/db/core/ConnectionManager.java b/src/main/java/org/wltea/analyzer/db/core/ConnectionManager.java new file mode 100644 index 00000000..0071b04b --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/ConnectionManager.java @@ -0,0 +1,151 @@ +package org.wltea.analyzer.db.core; + +import org.elasticsearch.SpecialPermission; + +import javax.sql.DataSource; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; + +/** + * 连接管理器 + * 封装了线程安全的数据库连接 + * + * @author fsren + * @date 2021-05-26 + */ +public class ConnectionManager { + + + private final DataSource dataSource; + private final ThreadLocal connHolder = new ThreadLocal<>(); + + public ConnectionManager(DataSource dataSource) { + this.dataSource = dataSource; + } + + public Connection getConnection() { + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + try { + Connection conn = connHolder.get(); + if (conn == null) { + conn = dataSource.getConnection(); + connHolder.set(conn); + } + return conn; + } catch (SQLException e) { + throw new RuntimeException("An error occurred while creating a database connection.", e); + } + }); + } + + public void close(Connection conn, Statement stmt) { + SpecialPermission.check(); + AccessController.doPrivileged((PrivilegedAction) () -> { + if (stmt != null) { + try { + stmt.close(); + } catch (SQLException ignored) { + } + } + if (conn != null) { + try { + if (conn.getAutoCommit()) { + conn.close(); + connHolder.remove(); + } + } catch (SQLException ignored) { + } + } + return null; + }); + + } + + public void close(Connection conn, Statement stmt, ResultSet rs) { + SpecialPermission.check(); + AccessController.doPrivileged((PrivilegedAction) () -> { + if (rs != null) { + try { + rs.close(); + } catch (SQLException ignored) { + } + } + return null; + }); + close(conn, stmt); + } + + public void startTransaction() { + SpecialPermission.check(); + AccessController.doPrivileged((PrivilegedAction) () -> { + try { + Connection conn = connHolder.get(); + if (conn != null) { + conn.close(); + connHolder.remove(); + } + conn = dataSource.getConnection(); + conn.setAutoCommit(false); + connHolder.set(conn); + } catch (SQLException e) { + throw new RuntimeException("An error occurred while starting transaction.", e); + } + return null; + }); + + } + + public void commit() { + SpecialPermission.check(); + AccessController.doPrivileged((PrivilegedAction) () -> { + Connection conn = connHolder.get(); + if (conn != null) { + try { + conn.commit(); + conn.close(); + connHolder.remove(); + } catch (SQLException e) { + throw new RuntimeException("An error occurred while committing transaction.", e); + } + } + return null; + }); + + } + + public void rollback() { + SpecialPermission.check(); + AccessController.doPrivileged((PrivilegedAction) () -> { + Connection conn = connHolder.get(); + if (conn != null) { + try { + conn.rollback(); + conn.close(); + connHolder.remove(); + } catch (SQLException e) { + throw new RuntimeException("An error occurred while committing transaction.", e); + } + } + return null; + }); + + } + + public boolean inTransaction() { + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + Connection conn = connHolder.get(); + try { + return conn != null && !conn.getAutoCommit(); + } catch (SQLException e) { + throw new RuntimeException("An error occurred while getting auto commit.", e); + } + }); + + } +} diff --git a/src/main/java/org/wltea/analyzer/db/core/ListRecordMapper.java b/src/main/java/org/wltea/analyzer/db/core/ListRecordMapper.java new file mode 100644 index 00000000..e10103d0 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/ListRecordMapper.java @@ -0,0 +1,25 @@ +package org.wltea.analyzer.db.core; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author fsren + * @date 2021-05-26 + */ +public class ListRecordMapper implements RecordMapper> { + private final RowMapper rowMapper; + + public ListRecordMapper(RowMapper rowMapper) { + this.rowMapper = rowMapper; + } + + @Override + public List map(Record record) { + List result = new ArrayList<>(); + while (record.next()) { + result.add(rowMapper.map(record.getCurrentRow())); + } + return result; + } +} \ No newline at end of file diff --git a/src/main/java/org/wltea/analyzer/db/core/MapRowMapper.java b/src/main/java/org/wltea/analyzer/db/core/MapRowMapper.java new file mode 100644 index 00000000..b58b02ec --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/MapRowMapper.java @@ -0,0 +1,22 @@ +package org.wltea.analyzer.db.core; + +import java.util.Hashtable; +import java.util.Map; + +/** + * @author fsren + * @date 2021-05-26 + */ +public class MapRowMapper implements RowMapper> { + @Override + public Map map(Row row) { + Map map = new Hashtable<>(); + int count = row.getColumnCount(); + for (int i = 1; i <= count; i++) { + String key = row.getColumnLabel(i); + Object value = row.getObject(i); + map.put(key, value); + } + return map; + } +} diff --git a/src/main/java/org/wltea/analyzer/db/core/Record.java b/src/main/java/org/wltea/analyzer/db/core/Record.java new file mode 100644 index 00000000..18f3aad3 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/Record.java @@ -0,0 +1,22 @@ +package org.wltea.analyzer.db.core; + +/** + * @author fsren + * @date 2021-05-26 + */ +public interface Record { + + /** + * 获取当前行 + * + * @return 当前行 + */ + Row getCurrentRow(); + + /** + * 移动到下一行 + * + * @return 若当前已到达最后一行,则返回false,否则返回true + */ + boolean next(); +} diff --git a/src/main/java/org/wltea/analyzer/db/core/RecordAdapterForResultSet.java b/src/main/java/org/wltea/analyzer/db/core/RecordAdapterForResultSet.java new file mode 100644 index 00000000..afdbbbb6 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/RecordAdapterForResultSet.java @@ -0,0 +1,177 @@ +package org.wltea.analyzer.db.core; + +import java.sql.*; + +/** + * 将java.sql.ResultSet转换成自定义的Record,封装底层实现细节 + * + * @author fsren + * @date 2021-05-26 + */ +public class RecordAdapterForResultSet implements Record, Row { + private final ResultSet rs; + + public RecordAdapterForResultSet(ResultSet resultSet) { + this.rs = resultSet; + } + + @Override + public Object getObject(String columnLabel) { + try { + return rs.getObject(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Object getObject(int columnIndex) { + try { + return rs.getObject(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public int getInt(String columnLabel) { + try { + return rs.getInt(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public int getInt(int columnIndex) { + try { + return rs.getInt(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public String getString(String columnLabel) { + try { + return rs.getString(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public String getString(int columnIndex) { + try { + return rs.getString(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public double getDouble(String columnLabel) { + try { + return rs.getDouble(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public double getDouble(int columnIndex) { + try { + return rs.getDouble(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Date getDate(int columnIndex) { + try { + return rs.getDate(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Date getDate(String columnLabel) { + try { + return rs.getDate(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Boolean getBoolean(int columnIndex) { + try { + return rs.getBoolean(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Boolean getBoolean(String columnLabel) { + try { + return rs.getBoolean(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Timestamp getTimestamp(int columnIndex) { + try { + return rs.getTimestamp(columnIndex); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Timestamp getTimestamp(String columnLabel) { + try { + return rs.getTimestamp(columnLabel); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public int getColumnCount() { + try { + ResultSetMetaData metaData = rs.getMetaData(); + return metaData.getColumnCount(); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public String getColumnLabel(int index) { + try { + ResultSetMetaData metaData = rs.getMetaData(); + return metaData.getColumnLabel(index); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Row getCurrentRow() { + return this; + } + + @Override + public boolean next() { + try { + return rs.next(); + } catch (SQLException e) { + throw new RuntimeException(e.getMessage(), e); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/wltea/analyzer/db/core/RecordMapper.java b/src/main/java/org/wltea/analyzer/db/core/RecordMapper.java new file mode 100644 index 00000000..975af800 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/RecordMapper.java @@ -0,0 +1,10 @@ +package org.wltea.analyzer.db.core; + +/** + * @author fsren + * @date 2021-05-26 + */ +public interface RecordMapper { + + T map(Record record); +} diff --git a/src/main/java/org/wltea/analyzer/db/core/Row.java b/src/main/java/org/wltea/analyzer/db/core/Row.java new file mode 100644 index 00000000..73d06431 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/Row.java @@ -0,0 +1,59 @@ +package org.wltea.analyzer.db.core; + +import java.sql.Date; +import java.sql.Timestamp; + +/** + * * 数据行:封装了结果集的一行数据。 + * * 数据行由若干列组成,每列都是一个值。 + * * Row通过Record的getCurrentRow方法获取。 + * * Row中包含了一系列getXXX方法用于获取列中的值。 + * + * @author fsren + * @date 2021-05-26 + */ +public interface Row { + + Object getObject(String columnLabel); + + Object getObject(int columnIndex); + + int getInt(String columnLabel); + + int getInt(int columnIndex); + + String getString(String columnLabel); + + String getString(int columnIndex); + + double getDouble(String columnLabel); + + double getDouble(int columnIndex); + + Date getDate(int columnIndex); + + Date getDate(String columnLabel); + + Boolean getBoolean(int columnIndex); + + Boolean getBoolean(String columnLabel); + + Timestamp getTimestamp(int columnIndex); + + Timestamp getTimestamp(String columnLabel); + + /** + * 获取列数 + * + * @return 列数 + */ + int getColumnCount(); + + /** + * 获取列标签 + * + * @param index 列索引(从1开始) + * @return 列标签 + */ + String getColumnLabel(int index); +} diff --git a/src/main/java/org/wltea/analyzer/db/core/RowMapper.java b/src/main/java/org/wltea/analyzer/db/core/RowMapper.java new file mode 100644 index 00000000..4dc2d962 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/db/core/RowMapper.java @@ -0,0 +1,11 @@ +package org.wltea.analyzer.db.core; + +/** + * 行转换器接口 + * + * @author fsren + * @date 2021-05-26 + */ +public interface RowMapper { + T map(Row row); +} diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java index bc330332..9e7b6fe4 100644 --- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -57,7 +57,7 @@ class DictSegment implements Comparable{ DictSegment(Character nodeChar){ if(nodeChar == null){ - throw new IllegalArgumentException("参数为空异常,字符不能为空"); + throw new IllegalArgumentException("node char cannot be empty"); } this.nodeChar = nodeChar; } diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index b61e0381..6620a7b6 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -38,10 +38,12 @@ import java.nio.file.SimpleFileVisitor; import java.security.AccessController; import java.security.PrivilegedAction; +import java.sql.Timestamp; import java.util.*; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.http.Header; import org.apache.http.HttpEntity; @@ -56,6 +58,10 @@ import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin; import org.wltea.analyzer.cfg.Configuration; import org.apache.logging.log4j.Logger; +import org.wltea.analyzer.db.DBConfigProperties; +import org.wltea.analyzer.db.DataSourceFactory; +import org.wltea.analyzer.db.JdbcUtil; +import org.wltea.analyzer.db.Lexicon; import org.wltea.analyzer.help.ESPluginLoggerFactory; @@ -80,9 +86,9 @@ public class Dictionary { */ private Configuration configuration; - private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName()); + private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName()); - private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1); + private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(2); private static final String PATH_DIC_MAIN = "main.dic"; private static final String PATH_DIC_SURNAME = "surname.dic"; @@ -97,9 +103,19 @@ public class Dictionary { private final static String EXT_STOP = "ext_stopwords"; private final static String REMOTE_EXT_STOP = "remote_ext_stopwords"; + private final static String DB_URL = "db_url"; + private final static String DB_USER = "db_user"; + private final static String DB_PASSWORD = "db_password"; + private final static String DB_RELOAD_INTERVAL = "db_reload_interval"; + private Path conf_dir; private Properties props; + private static DBConfigProperties configProperties; + private static JdbcUtil jdbcUtil; + private Timestamp lastReloadDate; + private Timestamp lastReloadStopWordDate; + private Dictionary(Configuration cfg) { this.configuration = cfg; this.props = new Properties(); @@ -128,6 +144,7 @@ private Dictionary(Configuration cfg) { logger.error("ik-analyzer", e); } } + loadJdbcConfig(); } private String getProperty(String key){ @@ -136,6 +153,41 @@ private String getProperty(String key){ } return null; } + private void loadJdbcConfig(){ + + String dbUrl = getProperty(DB_URL); + if(dbUrl == null || dbUrl.trim().equals("")){ + logger.info("db synchronization is not turned on, ignore detection"); + return; + } + logger.info("init db config start"); + configProperties = new DBConfigProperties(); + configProperties.setDbUrl(checkDefault(getProperty(DB_URL),"jdbc:mysql://127.0.0.1:3306/post_bar")); + configProperties.setUser(checkDefault(getProperty(DB_USER),"root")); + configProperties.setPassword(checkDefault(getProperty(DB_PASSWORD),"root")); + configProperties.setReloadInterval(checkDefault(getProperty(DB_RELOAD_INTERVAL),60)); + jdbcUtil = new JdbcUtil(DataSourceFactory.getDataSource(configProperties)); + logger.info("db config {}",configProperties.toString()); + logger.info("init db config end"); + } + private String checkDefault(String value,String defaultValue){ + if(value == null || "".equals(value.trim())){ + return defaultValue; + } + return value.trim(); + } + + private Integer checkDefault(String value,int defaultValue){ + if(value == null || "".equals(value.trim())){ + return defaultValue; + } + value = value.trim(); + try { + return Integer.parseInt(value); + }catch(Exception e){ + return defaultValue; + } + } /** * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段 @@ -155,6 +207,12 @@ public static synchronized void initial(Configuration cfg) { singleton.loadPrepDict(); singleton.loadStopWordDict(); + //启动线程 + if(cfg.isEnableMysqlDict()){ + // 10 秒是初始延迟可以修改的 ReloadInterval是间隔时间 单位秒 + pool.scheduleAtFixedRate(new MysqlMonitor(), 10, configProperties.getReloadInterval(), TimeUnit.SECONDS); + } + if(cfg.isEnableRemoteDict()){ // 建立监控线程 for (String location : singleton.getRemoteExtDictionarys()) { @@ -294,7 +352,7 @@ private String getDictRoot() { */ public static Dictionary getSingleton() { if (singleton == null) { - throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); + throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first."); } return singleton; } @@ -419,7 +477,7 @@ private void loadRemoteExtDict() { List lists = getRemoteWords(location); // 如果找不到扩展的字典,则忽略 if (lists == null) { - logger.error("[Dict Loading] " + location + "加载失败"); + logger.error("[Dict Loading] " + location + " load failed"); continue; } for (String theWord : lists) { @@ -469,7 +527,7 @@ private static List getRemoteWordsUnprivileged(String location) { } } - if (entity.getContentLength() > 0) { + if (entity.getContentLength() > 0 || entity.isChunked()) { in = new BufferedReader(new InputStreamReader(entity.getContent(), charset)); String line; while ((line = in.readLine()) != null) { @@ -518,7 +576,7 @@ private void loadStopWordDict() { List lists = getRemoteWords(location); // 如果找不到扩展的字典,则忽略 if (lists == null) { - logger.error("[Dict Loading] " + location + "加载失败"); + logger.error("[Dict Loading] " + location + " load failed"); continue; } for (String theWord : lists) { @@ -562,7 +620,7 @@ private void loadPrepDict() { } void reLoadMainDict() { - logger.info("重新加载词典..."); + logger.info("start to reload ik dict."); // 新开一个实例加载词典,减少加载过程对当前词典使用的影响 Dictionary tmpDict = new Dictionary(configuration); tmpDict.configuration = getSingleton().configuration; @@ -570,7 +628,101 @@ void reLoadMainDict() { tmpDict.loadStopWordDict(); _MainDict = tmpDict._MainDict; _StopWords = tmpDict._StopWords; - logger.info("重新加载词典完毕..."); + logger.info("reload ik dict finished."); + } + + void reLoadMysqlMainDict() { + if(jdbcUtil == null){ + logger.info("mysql parameters are not configured, mysql synchronization is ignored"); + return; + } + logger.info("======start mysql to reload ik dict.======"); + List mainDictList = loadMySQLExtDict(); + + AtomicInteger mainDictFillSegmentCount = new AtomicInteger(); + AtomicInteger mainDictDisableSegmentCount = new AtomicInteger(); + mainDictList.forEach(lexicon -> { + if(lexicon.getFill()){ + singleton._MainDict.fillSegment(lexicon.getLexiconText().trim().toLowerCase().toCharArray()); + mainDictFillSegmentCount.getAndIncrement(); + }else { + singleton._MainDict.disableSegment(lexicon.getLexiconText().trim().toLowerCase().toCharArray()); + mainDictDisableSegmentCount.getAndIncrement(); + } + }); + logger.info("last update mysql ext dic time :{},fill count:{} ,disable count:{} " ,singleton.lastReloadDate,mainDictFillSegmentCount,mainDictDisableSegmentCount); + + List stopWordDictList = loadMySQLStopWordDict(); + AtomicInteger stopWordDictFillSegmentCount = new AtomicInteger(); + AtomicInteger stopWordDictDisableSegmentCount = new AtomicInteger(); + stopWordDictList.forEach(lexicon -> { + if(lexicon.getFill()){ + singleton._StopWords.fillSegment(lexicon.getLexiconText().trim().toLowerCase().toCharArray()); + stopWordDictFillSegmentCount.getAndIncrement(); + }else { + singleton._StopWords.disableSegment(lexicon.getLexiconText().trim().toLowerCase().toCharArray()); + stopWordDictDisableSegmentCount.getAndIncrement(); + } + }); + logger.info("last update mysql stop word time :{},fill count:{} ,disable count:{} " ,singleton.lastReloadStopWordDate,stopWordDictFillSegmentCount,stopWordDictDisableSegmentCount); + logger.info("======reload mysql ik dict finished.======"); + } + /** + * 从MySql中加载动态词库 + */ + private List loadMySQLExtDict(){ + + String commandSql = "SELECT lexicon_text ,modify_date,lexicon_status,del_flag FROM es_lexicon WHERE lexicon_type = 0 ORDER BY modify_date desc"; + LinkedList params = new LinkedList<>(); + if(singleton.lastReloadDate != null){ + commandSql = "SELECT lexicon_text ,modify_date,lexicon_status,del_flag FROM es_lexicon WHERE lexicon_type = 0 AND modify_date > ? ORDER BY modify_date ASC"; + params.add(singleton.lastReloadDate); + } + List lexiconList = loadLexicon(commandSql,params.toArray()); + if(lexiconList == null || lexiconList.size() == 0){ + logger.info("the latest update record was not found, the last update time :{} " ,singleton.lastReloadDate); + return Collections.emptyList(); + } + singleton.lastReloadDate = lexiconList.get(lexiconList.size() - 1).getModifyDate(); + return lexiconList; + + } + + + /** + * 从MySql中加载远程停用词库 + */ + private List loadMySQLStopWordDict(){ + + String commandSql = "SELECT lexicon_text ,modify_date,lexicon_status,del_flag FROM es_lexicon WHERE lexicon_type = 1 ORDER BY modify_date ASC"; + LinkedList params = new LinkedList<>(); + if(singleton.lastReloadStopWordDate != null){ + commandSql = "SELECT lexicon_text ,modify_date,lexicon_status,del_flag FROM es_lexicon WHERE lexicon_type = 1 AND modify_date > ? ORDER BY modify_date ASC"; + params.add(singleton.lastReloadStopWordDate); + } + List lexiconList = loadLexicon(commandSql,params.toArray()); + if(lexiconList == null || lexiconList.size() == 0){ + logger.info("the last reload stop word not found, the last update time :{} " ,singleton.lastReloadStopWordDate); + return Collections.emptyList(); + } + singleton.lastReloadStopWordDate = lexiconList.get(lexiconList.size() - 1).getModifyDate(); + return lexiconList; + } + + private List loadLexicon(String commandSql,Object[] param){ + return jdbcUtil.queryList(commandSql, row -> { + String lexiconText = row.getString("lexicon_text"); + Timestamp modifyDate = row.getTimestamp("modify_date"); + //词条状态 0正常 1暂停使用 + int status = row.getInt("lexicon_status"); + //删除状态 0正常 1暂停使用 + int deleteStatus = row.getInt("del_flag"); + Lexicon lexicon = new Lexicon(); + lexicon.setLexiconText(lexiconText); + lexicon.setModifyDate(modifyDate); + lexicon.setFill(status == 0 && deleteStatus == 0); + return lexicon; + },param); } } diff --git a/src/main/java/org/wltea/analyzer/dic/MysqlMonitor.java b/src/main/java/org/wltea/analyzer/dic/MysqlMonitor.java new file mode 100644 index 00000000..5cdbba58 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/dic/MysqlMonitor.java @@ -0,0 +1,13 @@ +package org.wltea.analyzer.dic; + +/** + * @author fsren + * @date 2021-05-25 + */ +public class MysqlMonitor implements Runnable{ + + @Override + public void run() { + Dictionary.getSingleton().reLoadMysqlMainDict(); + } +} diff --git a/src/main/resources/plugin-security.policy b/src/main/resources/plugin-security.policy index 55d759a3..c354562c 100644 --- a/src/main/resources/plugin-security.policy +++ b/src/main/resources/plugin-security.policy @@ -1,4 +1,6 @@ grant { // needed because of the hot reload functionality + permission java.lang.RuntimePermission "setContextClassLoader","getClassLoader"; + permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; permission java.net.SocketPermission "*", "connect,resolve"; }; \ No newline at end of file