diff --git a/.travis.yml b/.travis.yml
index c535657e9a..73a97e6aa1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,5 +11,7 @@ addons:
 script:
   - make test_ci DEBUG=1 -j3
   - make clean
+  - make test_ci CFLAGS_USER=-DMCL_DONT_USE_XBYAK -j3
+  - make clean
   - make test_go
  
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7bf1dede9..bc91a8177a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 2.6)
+cmake_minimum_required (VERSION 3.0)
 project(mcl CXX ASM)
 set(SRCS src/fp.cpp)
 
@@ -12,11 +12,59 @@ option(
 	"download cybozulib_ext"
 	OFF
 )
+option(
+	USE_OPENSSL
+	"use openssl"
+	ON
+)
+option(
+	USE_GMP
+	"use gmp"
+	ON
+)
+option(
+	USE_ASM
+	"use asm"
+	ON
+)
+option(
+	USE_XBYAK
+	"use xbyak"
+	ON
+)
+option(
+	USE_LLVM
+	"use base64.ll with -DCMAKE_CXX_COMPILER=clang++"
+	OFF
+)
+option(
+	ONLY_LIB
+	"only lib"
+	OFF
+)
 
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
+if(USE_LLVM) # build src/gen, run it to emit base64.ll, then compile base64.ll into base64.o
+	add_executable(gen src/gen.cpp)
+	add_custom_target(base64.ll # target that carries the generated base64.ll as a source
+		DEPENDS gen
+		SOURCES base64.ll
+	)
+	add_custom_command(OUTPUT base64.ll # NOTE(review): no DEPENDS listed; ordering against gen seems to come only from the target above - confirm
+		COMMAND gen > base64.ll
+	)
+	add_custom_target(base64.o # target that carries the compiled object; mcl/mcl_st add a dependency on it below
+		DEPENDS base64.ll
+		SOURCES base64.o
+	)
+	add_custom_command(OUTPUT base64.o # NOTE(review): no DEPENDS on base64.ll, so a regenerated .ll may not retrigger this rule - confirm
+		COMMAND ${CMAKE_CXX_COMPILER} -c -o base64.o base64.ll -O3 -fPIC
+	)
+endif()
+
 if(MSVC)
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /W4 /Oy /Ox /EHsc /GS- /Zi /DNDEBUG /DNOMINMAX")
 	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /W4 /DNOMINMAX")
@@ -32,20 +80,43 @@ else()
 		add_definitions(-DMCL_MAX_BIT_SIZE=${MCL_MAX_BIT_SIZE})
 	endif()
 
-	if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
-		add_definitions(-DMCL_USE_LLVM=1)
-		set(SRCS ${SRCS} src/asm/aarch64.s)
-		set(CPU arch64)
-	elseif(APPLE)
-		add_definitions(-DMCL_USE_LLVM=1)
-		set(SRCS ${SRCS} src/asm/x86-64mac.s src/asm/x86-64mac.bmi2.s)
-		set(CPU x86-64)
-	elseif(UNIX)
-		add_definitions(-DMCL_USE_LLVM=1)
-		set(SRCS ${SRCS} src/asm/x86-64.s src/asm/x86-64.bmi2.s)
-		set(CPU x86-64)
+	if(USE_LLVM)
+		add_definitions(-DMCL_USE_LLVM=1 -DMCL_LLVM_BMI2=0)
+	elseif(USE_ASM)
+		if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+			add_definitions(-DMCL_USE_LLVM=1)
+			set(SRCS ${SRCS} src/asm/aarch64.s)
+			set(CPU arch64)
+		elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm")
+			add_definitions(-DMCL_USE_LLVM=1)
+			set(SRCS ${SRCS} src/asm/arm.s)
+			set(CPU arm)
+		elseif(APPLE)
+			add_definitions(-DMCL_USE_LLVM=1)
+			set(SRCS ${SRCS} src/asm/x86-64mac.s src/asm/x86-64mac.bmi2.s)
+			set(CPU x86-64)
+		elseif(UNIX)
+			add_definitions(-DMCL_USE_LLVM=1)
+			set(SRCS ${SRCS} src/asm/x86-64.s src/asm/x86-64.bmi2.s)
+			set(CPU x86-64)
+		endif()
+	endif()
+	if(USE_GMP)
+		set(EXT_LIBS ${EXT_LIBS} gmp gmpxx)
+	endif()
+	if(USE_OPENSSL)
+		set(EXT_LIBS ${EXT_LIBS} crypto)
 	endif()
-	set(LIBS mcl gmp gmpxx crypto)
+endif()
+
+if(NOT USE_GMP)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMCL_USE_VINT -DMCL_VINT_FIXED_BUFFER")
+endif()
+if(NOT USE_OPENSSL)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMCL_DONT_USE_OPENSSL")
+endif()
+if(NOT USE_XBYAK)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMCL_DONT_USE_XBYAK")
 endif()
 
 if(DOWNLOAD_SOURCE)
@@ -84,31 +155,55 @@ include_directories(
 	${mcl_SOURCE_DIR}/include
 )
 
-add_library(mcl STATIC ${SRCS})
-add_library(mcl_dy SHARED ${SRCS})
-target_link_libraries(mcl_dy ${LIBS})
-set_target_properties(mcl_dy PROPERTIES OUTPUT_NAME mcl VERSION 1.0.0 SOVERSION 1)
+if(USE_LLVM)
+	add_library(mcl SHARED ${SRCS} base64.o)
+	add_library(mcl_st STATIC ${SRCS} base64.o)
+	add_dependencies(mcl base64.o)
+	add_dependencies(mcl_st base64.o)
+else()
+	add_library(mcl SHARED ${SRCS})
+	add_library(mcl_st STATIC ${SRCS})
+endif()
+target_link_libraries(mcl ${EXT_LIBS})
+target_link_libraries(mcl_st ${EXT_LIBS})
+set_target_properties(mcl_st PROPERTIES OUTPUT_NAME mcl)
+#set_target_properties(mcl_st PROPERTIES PREFIX "lib")
+#set_target_properties(mcl PROPERTIES OUTPUT_NAME mcl VERSION 1.0.0 SOVERSION 1)
 # For semantics of ABI compatibility including when you must bump SOVERSION, see:
 # https://community.kde.org/Policies/Binary_Compatibility_Issues_With_C%2B%2B#The_Do.27s_and_Don.27ts
 
+set(LIBS mcl ${EXT_LIBS}) # link set used by the mclbn* libraries here and by the TEST_BASE executables
+foreach(bit IN ITEMS 256 384 384_256 512) # one C-API shared library and one test executable per bit-size variant
+	add_library(mclbn${bit} SHARED src/bn_c${bit}.cpp)
+	target_link_libraries(mclbn${bit} ${LIBS})
+	add_executable(bn_c${bit}_test test/bn_c${bit}_test.cpp) # NOTE(review): not guarded by ONLY_LIB, unlike the TEST_BASE tests - confirm intended
+	target_link_libraries(bn_c${bit}_test mclbn${bit})
+endforeach()
+
 file(GLOB MCL_HEADERS include/mcl/*.hpp include/mcl/bn.h include/mcl/curve_type.h)
 file(GLOB CYBOZULIB_HEADERS include/cybozu/*.hpp)
 
 install(TARGETS mcl DESTINATION lib)
-install(TARGETS mcl_dy DESTINATION lib)
+install(TARGETS mcl_st DESTINATION lib)
+install(TARGETS mclbn256 DESTINATION lib)
+install(TARGETS mclbn384 DESTINATION lib)
+install(TARGETS mclbn384_256 DESTINATION lib)
+install(TARGETS mclbn512 DESTINATION lib)
 install(FILES ${MCL_HEADERS} DESTINATION include/mcl)
 install(FILES include/mcl/impl/bn_c_impl.hpp DESTINATION include/mcl/impl)
 install(FILES ${CYBOZULIB_HEADERS} DESTINATION include/cybozu)
 
-set(TEST_BASE fp_test ec_test fp_util_test window_method_test elgamal_test fp_tower_test gmp_test bn_test glv_test)
-#set(TEST_BASE bn_test)
-foreach(base IN ITEMS ${TEST_BASE})
-	add_executable(
-		${base}
-		test/${base}.cpp
-	)
-	target_link_libraries(
-		${base}
-		${LIBS}
-	)
-endforeach()
+if(NOT ONLY_LIB)
+	set(TEST_BASE fp_test ec_test fp_util_test window_method_test elgamal_test fp_tower_test gmp_test bn_test glv_test)
+	#set(TEST_BASE bn_test)
+	foreach(base IN ITEMS ${TEST_BASE})
+		add_executable(
+			${base}
+			test/${base}.cpp
+		)
+		target_link_libraries(
+			${base}
+			${LIBS}
+		)
+	endforeach()
+endif()
diff --git a/Makefile b/Makefile
index 7920d61531..6273490e28 100644
--- a/Makefile
+++ b/Makefile
@@ -4,10 +4,12 @@ OBJ_DIR=obj
 EXE_DIR=bin
 SRC_SRC=fp.cpp bn_c256.cpp bn_c384.cpp bn_c512.cpp she_c256.cpp
 TEST_SRC=fp_test.cpp ec_test.cpp fp_util_test.cpp window_method_test.cpp elgamal_test.cpp fp_tower_test.cpp gmp_test.cpp bn_test.cpp bn384_test.cpp glv_test.cpp paillier_test.cpp she_test.cpp vint_test.cpp bn512_test.cpp ecdsa_test.cpp conversion_test.cpp
-TEST_SRC+=bn_c256_test.cpp bn_c384_test.cpp bn_c384_256_test.cpp bn_c512_test.cpp she_c256_test.cpp she_c384_test.cpp
+TEST_SRC+=bn_c256_test.cpp bn_c384_test.cpp bn_c384_256_test.cpp bn_c512_test.cpp
+TEST_SRC+=she_c256_test.cpp she_c384_test.cpp she_c384_256_test.cpp
 TEST_SRC+=aggregate_sig_test.cpp array_test.cpp
 TEST_SRC+=bls12_test.cpp
 TEST_SRC+=ecdsa_c_test.cpp
+TEST_SRC+=modp_test.cpp
 ifeq ($(CPU),x86-64)
   MCL_USE_XBYAK?=1
   TEST_SRC+=mont_fp_test.cpp sq_test.cpp
@@ -34,6 +36,8 @@ BN384_SNAME=mclbn384
 BN384_256_SNAME=mclbn384_256
 BN512_SNAME=mclbn512
 SHE256_SNAME=mclshe256
+SHE384_SNAME=mclshe384
+SHE384_256_SNAME=mclshe384_256
 MCL_SLIB=$(LIB_DIR)/lib$(MCL_SNAME).$(LIB_SUF)
 BN256_LIB=$(LIB_DIR)/libmclbn256.a
 BN256_SLIB=$(LIB_DIR)/lib$(BN256_SNAME).$(LIB_SUF)
@@ -44,19 +48,25 @@ BN384_256_SLIB=$(LIB_DIR)/lib$(BN384_256_SNAME).$(LIB_SUF)
 BN512_LIB=$(LIB_DIR)/libmclbn512.a
 BN512_SLIB=$(LIB_DIR)/lib$(BN512_SNAME).$(LIB_SUF)
 SHE256_LIB=$(LIB_DIR)/libmclshe256.a
+SHE256_SLIB=$(LIB_DIR)/lib$(SHE256_SNAME).$(LIB_SUF)
 SHE384_LIB=$(LIB_DIR)/libmclshe384.a
+SHE384_SLIB=$(LIB_DIR)/lib$(SHE384_SNAME).$(LIB_SUF)
+SHE384_256_LIB=$(LIB_DIR)/libmclshe384_256.a
+SHE384_256_SLIB=$(LIB_DIR)/lib$(SHE384_256_SNAME).$(LIB_SUF)
 ECDSA_LIB=$(LIB_DIR)/libmclecdsa.a
-all: $(MCL_LIB) $(MCL_SLIB) $(BN256_LIB) $(BN256_SLIB) $(BN384_LIB) $(BN384_SLIB) $(BN384_256_LIB) $(BN384_256_SLIB) $(BN512_LIB) $(BN512_SLIB) $(SHE256_LIB) $(SHE384_lib) $(ECDSA_LIB)
+SHE_LIB_ALL=$(SHE256_LIB) $(SHE256_SLIB) $(SHE384_LIB) $(SHE384_SLIB) $(SHE384_256_LIB) $(SHE384_256_SLIB)
+all: $(MCL_LIB) $(MCL_SLIB) $(BN256_LIB) $(BN256_SLIB) $(BN384_LIB) $(BN384_SLIB) $(BN384_256_LIB) $(BN384_256_SLIB) $(BN512_LIB) $(BN512_SLIB) $(SHE_LIB_ALL) $(ECDSA_LIB)
 
 #LLVM_VER=-3.8
 LLVM_LLC=llc$(LLVM_VER)
 LLVM_OPT=opt$(LLVM_VER)
 LLVM_OPT_VERSION=$(shell $(LLVM_OPT) --version 2>/dev/null | awk '/version/ {print $$3}')
 GEN_EXE=src/gen
+GEN_EXE_OPT=-u $(BIT)
 # incompatibility between llvm 3.4 and the later version
 ifneq ($(LLVM_OPT_VERSION),)
 ifeq ($(shell expr $(LLVM_OPT_VERSION) \< 3.5.0),1)
-  GEN_EXE_OPT=-old
+  GEN_EXE_OPT+=-old
 endif
 endif
 ifeq ($(OS),mac)
@@ -75,6 +85,7 @@ BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o
 BN512_OBJ=$(OBJ_DIR)/bn_c512.o
 SHE256_OBJ=$(OBJ_DIR)/she_c256.o
 SHE384_OBJ=$(OBJ_DIR)/she_c384.o
+SHE384_256_OBJ=$(OBJ_DIR)/she_c384_256.o
 ECDSA_OBJ=$(OBJ_DIR)/ecdsa_c.o
 FUNC_LIST=src/func.list
 ifeq ($(findstring $(OS),mingw64/cygwin),)
@@ -120,6 +131,9 @@ ifneq ($(findstring $(OS),mac/mingw64),)
   BN384_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
   BN384_256_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
   BN512_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
+  SHE256_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
+  SHE384_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
+  SHE384_256_SLIB_LDFLAGS+=-l$(MCL_SNAME) -L./lib
 endif
 ifeq ($(OS),mingw64)
   MCL_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(MCL_SNAME).a
@@ -127,6 +141,9 @@ ifeq ($(OS),mingw64)
   BN384_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(BN384_SNAME).a
   BN384_256_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(BN384_256_SNAME).a
   BN512_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(BN512_SNAME).a
+  SHE256_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(SHE256_SNAME).a
+  SHE384_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(SHE384_SNAME).a
+  SHE384_256_SLIB_LDFLAGS+=-Wl,--out-implib,$(LIB_DIR)/lib$(SHE384_256_SNAME).a
 endif
 
 $(MCL_LIB): $(LIB_OBJ)
@@ -144,6 +161,18 @@ $(SHE256_LIB): $(SHE256_OBJ)
 $(SHE384_LIB): $(SHE384_OBJ)
 	$(AR) $@ $(SHE384_OBJ)
 
+$(SHE384_256_LIB): $(SHE384_256_OBJ)
+	$(AR) $@ $(SHE384_256_OBJ)
+
+$(SHE256_SLIB): $(SHE256_OBJ) $(MCL_LIB)
+	$(PRE)$(CXX) -o $@ $(SHE256_OBJ) $(MCL_LIB) -shared $(LDFLAGS) $(SHE256_SLIB_LDFLAGS)
+
+$(SHE384_SLIB): $(SHE384_OBJ) $(MCL_LIB)
+	$(PRE)$(CXX) -o $@ $(SHE384_OBJ) $(MCL_LIB) -shared $(LDFLAGS) $(SHE384_SLIB_LDFLAGS)
+
+$(SHE384_256_SLIB): $(SHE384_256_OBJ) $(MCL_LIB)
+	$(PRE)$(CXX) -o $@ $(SHE384_256_OBJ) $(MCL_LIB) -shared $(LDFLAGS) $(SHE384_256_SLIB_LDFLAGS)
+
 $(ECDSA_LIB): $(ECDSA_OBJ)
 	$(AR) $@ $(ECDSA_OBJ)
 
@@ -223,6 +252,11 @@ test_go:
 	$(MAKE) test_go384
 	$(MAKE) test_go384_256
 
+test_python_she: $(SHE384_256_SLIB) # ffi/python/she.py loads the mclshe384_256 shared library, so that is what must be built first
+	cd ffi/python && env LD_LIBRARY_PATH="../../lib" DYLD_LIBRARY_PATH="../../lib" PATH=$$PATH:"../../lib" python3 she.py
+test_python: # umbrella target for the Python binding tests
+	$(MAKE) test_python_she
+
 test_java:
 	$(MAKE) -C ffi/java test
 
@@ -262,9 +296,18 @@ $(EXE_DIR)/she_c256_test.exe: $(OBJ_DIR)/she_c256_test.o $(SHE256_LIB) $(MCL_LIB
 $(EXE_DIR)/she_c384_test.exe: $(OBJ_DIR)/she_c384_test.o $(SHE384_LIB) $(MCL_LIB)
 	$(PRE)$(CXX) $< -o $@ $(SHE384_LIB) $(MCL_LIB) $(LDFLAGS)
 
+$(EXE_DIR)/she_c384_256_test.exe: $(OBJ_DIR)/she_c384_256_test.o $(SHE384_256_LIB) $(MCL_LIB)
+	$(PRE)$(CXX) $< -o $@ $(SHE384_256_LIB) $(MCL_LIB) $(LDFLAGS)
+
 $(EXE_DIR)/ecdsa_c_test.exe: $(OBJ_DIR)/ecdsa_c_test.o $(ECDSA_LIB) $(MCL_LIB) src/ecdsa_c.cpp include/mcl/ecdsa.hpp include/mcl/ecdsa.h
 	$(PRE)$(CXX) $< -o $@ $(ECDSA_LIB) $(MCL_LIB) $(LDFLAGS)
 
+$(OBJ_DIR)/modp_test.o: test/modp_test.cpp
+	$(PRE)$(CXX) -c $< -o $@ -MMD -MP -MF $(@:.o=.d) -DMCL_USE_VINT -DMCL_MAX_BIT_SIZE=384 -DMCL_VINT_64BIT_PORTABLE -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I./include -O2 $(CFLAGS_WARN)
+
+$(EXE_DIR)/modp_test.exe: $(OBJ_DIR)/modp_test.o
+	$(PRE)$(CXX) $< -o $@
+
 SAMPLE_EXE=$(addprefix $(EXE_DIR)/,$(addsuffix .exe,$(basename $(SAMPLE_SRC))))
 sample: $(SAMPLE_EXE) $(MCL_LIB)
 
@@ -317,8 +360,7 @@ ecdsa-wasm:
 bin/emu:
 	$(CXX) -g -o $@ src/fp.cpp src/bn_c256.cpp test/bn_c256_test.cpp -DMCL_DONT_USE_XBYAK -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_64BIT_PORTABLE -DMCL_VINT_FIXED_BUFFER -DMCL_MAX_BIT_SIZE=256 -I./include
 bin/pairing_c_min.exe: sample/pairing_c.c include/mcl/vint.hpp src/fp.cpp include/mcl/bn.hpp
-#	$(CXX) -o $@ sample/pairing_c.c src/fp.cpp src/bn_c256.cpp -O2 -g -I./include -fno-exceptions -fno-rtti -fno-threadsafe-statics -DMCL_DONT_USE_XBYAK -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -DCYBOZU_DONT_USE_EXCEPTION -DCYBOZU_DONT_USE_STRING -DMCL_DONT_USE_CSPRNG -DMCL_MAX_BIT_SIZE=256 -DMCL_VINT_64BIT_PORTABLE -DNDEBUG -pg
-	$(CXX) -o $@ sample/pairing_c.c src/fp.cpp src/bn_c256.cpp -O2 -g -I./include -fno-threadsafe-statics -DMCL_DONT_USE_XBYAK -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -DMCL_DONT_USE_CSPRNG -DMCL_MAX_BIT_SIZE=256 -DMCL_VINT_64BIT_PORTABLE -DNDEBUG
+	$(CXX) -o $@ sample/pairing_c.c src/fp.cpp src/bn_c256.cpp -O3 -g -I./include -fno-threadsafe-statics -DMCL_DONT_USE_XBYAK -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -DMCL_MAX_BIT_SIZE=256 -DMCL_VINT_64BIT_PORTABLE -DCYBOZU_DONT_USE_STRING -DCYBOZU_DONT_USE_EXCEPTION -DNDEBUG # -DMCL_DONT_USE_CSPRNG
 
 make_tbl:
 	$(MAKE) ../bls/src/qcoeff-bn254.hpp
@@ -334,7 +376,7 @@ update_cybozulib:
 	cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/
 
 clean:
-	$(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(LLVM_SRC) $(FUNC_LIST) src/*.ll lib/*.a
+	$(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a
 
 ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC)
 DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC))))
diff --git a/ffi/cs/bn256.cs b/ffi/cs/bn256.cs
index 8bd0200947..22169d1bfd 100644
--- a/ffi/cs/bn256.cs
+++ b/ffi/cs/bn256.cs
@@ -5,7 +5,7 @@
 namespace mcl {
 	public class BN256 {
 		[DllImport("mclBn256.dll")]
-		public static extern int mclBn_init(int curve, int maxUnitSize);
+		public static extern int mclBn_init(int curve, int compiledTimeVar);
 		[DllImport("mclBn256.dll")]
 		public static extern void mclBnFr_clear(ref Fr x);
 		[DllImport("mclBn256.dll")]
@@ -128,8 +128,10 @@ public class BN256 {
 		public static void init()
 		{
 			const int curveFp254BNb = 0;
-			const int maxUnitSize = 4;
-			if (mclBn_init(curveFp254BNb, maxUnitSize) != 0) {
+            const int MCLBN_FR_UNIT_SIZE = 4;
+            const int MCLBN_FP_UNIT_SIZE = 4;
+            const int MCLBN_COMPILED_TIME_VAR  = (MCLBN_FR_UNIT_SIZE) * 10 + (MCLBN_FP_UNIT_SIZE);
+            if (mclBn_init(curveFp254BNb, MCLBN_COMPILED_TIME_VAR) != 0) {
 				throw new InvalidOperationException("mclBn_init");
 			}
 		}
diff --git a/ffi/go/mcl/init.go b/ffi/go/mcl/init.go
new file mode 100644
index 0000000000..aaa7a7cdc1
--- /dev/null
+++ b/ffi/go/mcl/init.go
@@ -0,0 +1,25 @@
+package mcl
+
+/*
+#cgo bn256 CFLAGS:-DMCLBN_FP_UNIT_SIZE=4
+#cgo bn384 CFLAGS:-DMCLBN_FP_UNIT_SIZE=6
+#cgo bn384_256 CFLAGS:-DMCLBN_FP_UNIT_SIZE=6 -DMCLBN_FR_UNIT_SIZE=4
+#cgo bn256 LDFLAGS:-lmclbn256 -lmcl
+#cgo bn384 LDFLAGS:-lmclbn384 -lmcl
+#cgo bn384_256 LDFLAGS:-lmclbn384_256 -lmcl
+#include <mcl/bn.h>
+*/
+import "C"
+import "fmt"
+// Init --
+// call this function before calling all the other operations
+// this function is not thread safe
+func Init(curve int) error {
+	err := C.mclBn_init(C.int(curve), C.MCLBN_COMPILED_TIME_VAR)
+	if err != 0 {
+		return fmt.Errorf("ERR mclBn_init curve=%d", curve)
+	}
+	return nil
+}
+
+
diff --git a/ffi/go/mcl/mcl.go b/ffi/go/mcl/mcl.go
index a0c8bb4d36..fbc439b47a 100644
--- a/ffi/go/mcl/mcl.go
+++ b/ffi/go/mcl/mcl.go
@@ -4,9 +4,6 @@ package mcl
 #cgo bn256 CFLAGS:-DMCLBN_FP_UNIT_SIZE=4
 #cgo bn384 CFLAGS:-DMCLBN_FP_UNIT_SIZE=6
 #cgo bn384_256 CFLAGS:-DMCLBN_FP_UNIT_SIZE=6 -DMCLBN_FR_UNIT_SIZE=4
-#cgo bn256 LDFLAGS:-lmclbn256 -lmcl
-#cgo bn384 LDFLAGS:-lmclbn384 -lmcl
-#cgo bn384_256 LDFLAGS:-lmclbn384_256 -lmcl
 #include <mcl/bn.h>
 */
 import "C"
@@ -28,17 +25,6 @@ const BLS12_381 = C.MCL_BLS12_381
 // IoSerializeHexStr
 const IoSerializeHexStr = C.MCLBN_IO_SERIALIZE_HEX_STR
 
-// Init --
-// call this function before calling all the other operations
-// this function is not thread safe
-func Init(curve int) error {
-	err := C.mclBn_init(C.int(curve), C.MCLBN_COMPILED_TIME_VAR)
-	if err != 0 {
-		return fmt.Errorf("ERR mclBn_init curve=%d", curve)
-	}
-	return nil
-}
-
 // GetFrUnitSize() --
 func GetFrUnitSize() int {
 	return int(C.MCLBN_FR_UNIT_SIZE)
diff --git a/ffi/python/she.py b/ffi/python/she.py
new file mode 100644
index 0000000000..4234a67679
--- /dev/null
+++ b/ffi/python/she.py
@@ -0,0 +1,369 @@
+import os
+import platform
+from ctypes import *
+
+BN254 = 0
+BLS12_381 = 5
+MCLBN_FR_UNIT_SIZE = 4
+MCLBN_FP_UNIT_SIZE = 6
+
+FR_SIZE = MCLBN_FR_UNIT_SIZE
+G1_SIZE = MCLBN_FP_UNIT_SIZE * 3
+G2_SIZE = MCLBN_FP_UNIT_SIZE * 6
+GT_SIZE = MCLBN_FP_UNIT_SIZE * 12
+
+SEC_SIZE = FR_SIZE * 2
+PUB_SIZE = G1_SIZE + G2_SIZE
+G1_CIPHER_SIZE = G1_SIZE * 2
+G2_CIPHER_SIZE = G2_SIZE * 2
+GT_CIPHER_SIZE = GT_SIZE * 4
+
+MCLBN_COMPILED_TIME_VAR = (MCLBN_FR_UNIT_SIZE * 10) + MCLBN_FP_UNIT_SIZE
+
+Buffer = c_ubyte * 2304
+lib = None
+
+def init(curveType=BN254):
+	# Load this platform's mclshe384_256 shared library into the module-level `lib` and call sheInit on it.
+	global lib
+	libNameTbl = {
+		'Linux': 'libmclshe384_256.so',
+		'Darwin': 'libmclshe384_256.dylib',
+		'Windows': 'mclshe384_256.dll',
+	}
+	name = platform.system()
+	if name not in libNameTbl:
+		raise RuntimeError("not support yet", name)
+	lib = cdll.LoadLibrary(libNameTbl[name])
+	ret = lib.sheInit(curveType, MCLBN_COMPILED_TIME_VAR)
+	if ret != 0:
+		raise RuntimeError("sheInit", ret)
+	lib.mclBn_verifyOrderG1(0)
+	lib.mclBn_verifyOrderG2(0)
+	# custom setup for a function which returns pointer
+	lib.shePrecomputedPublicKeyCreate.restype = c_void_p
+
+def setRangeForDLP(hashSize):
+	ret = lib.sheSetRangeForDLP(hashSize)
+	if ret != 0:
+		raise RuntimeError("setRangeForDLP", ret)
+
+def setTryNum(tryNum):
+	lib.sheSetTryNum(tryNum)
+
+def _hexStr(v):
+	# Render every element of v as two lowercase hex digits and
+	# concatenate the pieces in order (empty input gives "").
+	digits = [format(x, '02x') for x in v]
+	return "".join(digits)
+
+def _serialize(self, f):
+	# f fills a scratch Buffer from self.v; its return value is used as the output length, 0 being treated as failure.
+	out = Buffer()
+	n = f(byref(out), len(out), byref(self.v))
+	if n == 0: raise RuntimeError("serialize")
+	return out[:n]
+
+def _deserialize(cstr, f, buf):
+	# Create a cstr instance and let f fill its v field from the byte sequence buf; a 0 return from f is an error.
+	obj = cstr()
+	size = len(buf)
+	raw = (c_ubyte * size)(*buf)
+	if f(byref(obj.v), byref(raw), size) == 0: raise RuntimeError("deserialize")
+	return obj
+
+class CipherTextG1(Structure):
+	_fields_ = [("v", c_ulonglong * G1_CIPHER_SIZE)]
+	def serialize(self):
+		return _serialize(self, lib.sheCipherTextG1Serialize)
+	def serializeToHexStr(self):
+		return _hexStr(self.serialize())
+
+class CipherTextG2(Structure):
+	_fields_ = [("v", c_ulonglong * G2_CIPHER_SIZE)]
+	def serialize(self):
+		return _serialize(self, lib.sheCipherTextG2Serialize)
+	def serializeToHexStr(self):
+		return _hexStr(self.serialize())
+
+class CipherTextGT(Structure):
+	_fields_ = [("v", c_ulonglong * GT_CIPHER_SIZE)]
+	def serialize(self):
+		return _serialize(self, lib.sheCipherTextGTSerialize)
+	def serializeToHexStr(self):
+		return _hexStr(self.serialize())
+
+def _enc(CT, enc, encIntVec, neg, p, m):
+	# Produce a new CT from integer m using the supplied library entry points (p is passed through as their second argument).
+	c = CT()
+	if -0x80000000 <= m <= 0x7fffffff:
+		# values that fit in 32 bits go through the scalar entry point
+		ret = enc(byref(c.v), p, m)
+		if ret != 0:
+			raise RuntimeError("enc", m)
+		return c
+	minus = m < 0
+	rest = -m if minus else m # magnitude; m itself is left intact so the errors below report the caller's value
+	if rest >= 1 << (MCLBN_FR_UNIT_SIZE * 64):
+		raise RuntimeError("enc:too large m", m)
+	a = []
+	while rest > 0:
+		a.append(rest & 0xffffffff)
+		rest >>= 32
+	# a now holds the magnitude as 32-bit limbs, least significant first
+	ca = (c_uint * len(a))(*a)
+	ret = encIntVec(byref(c.v), p, byref(ca), sizeof(ca))
+	if ret != 0:
+		raise RuntimeError("enc:IntVec", m)
+	if minus:
+		ret = neg(byref(c.v), byref(c.v))
+		if ret != 0:
+			raise RuntimeError("enc:neg", m)
+	return c
+
+class PrecomputedPublicKey(Structure): # wraps the opaque handle returned by shePrecomputedPublicKeyCreate
+	def __init__(self):
+		self.p = 0 # falsy until create() stores a handle
+	def create(self):
+		if not self.p:
+			self.p = c_void_p(lib.shePrecomputedPublicKeyCreate())
+			if not self.p: # a c_void_p never compares equal to the int 0; its truth value is what detects NULL
+				raise RuntimeError("PrecomputedPublicKey::create")
+	def destroy(self):
+		lib.shePrecomputedPublicKeyDestroy(self.p)
+	def encG1(self, m):
+		return _enc(CipherTextG1, lib.shePrecomputedPublicKeyEncG1, lib.shePrecomputedPublicKeyEncIntVecG1, lib.sheNegG1, self.p, m)
+	def encG2(self, m):
+		return _enc(CipherTextG2, lib.shePrecomputedPublicKeyEncG2, lib.shePrecomputedPublicKeyEncIntVecG2, lib.sheNegG2, self.p, m)
+	def encGT(self, m):
+		return _enc(CipherTextGT, lib.shePrecomputedPublicKeyEncGT, lib.shePrecomputedPublicKeyEncIntVecGT, lib.sheNegGT, self.p, m)
+
+class PublicKey(Structure):
+	_fields_ = [("v", c_ulonglong * PUB_SIZE)]
+	def serialize(self):
+		return _serialize(self, lib.shePublicKeySerialize)
+	def serializeToHexStr(self):
+		return _hexStr(self.serialize())
+	def encG1(self, m):
+		return _enc(CipherTextG1, lib.sheEncG1, lib.sheEncIntVecG1, lib.sheNegG1, byref(self.v), m)
+	def encG2(self, m):
+		return _enc(CipherTextG2, lib.sheEncG2, lib.sheEncIntVecG2, lib.sheNegG2, byref(self.v), m)
+	def encGT(self, m):
+		return _enc(CipherTextGT, lib.sheEncGT, lib.sheEncIntVecGT, lib.sheNegGT, byref(self.v), m)
+	def createPrecomputedPublicKey(self):
+		ppub = PrecomputedPublicKey()
+		ppub.create()
+		ret = lib.shePrecomputedPublicKeyInit(ppub.p, byref(self.v))
+		if ret != 0:
+			raise RuntimeError("createPrecomputedPublicKey")
+		return ppub
+
+class SecretKey(Structure): # secret key storage: SEC_SIZE 64-bit words handed to the library by reference
+	_fields_ = [("v", c_ulonglong * SEC_SIZE)]
+	def setByCSPRNG(self):
+		ret = lib.sheSecretKeySetByCSPRNG(byref(self.v))
+		if ret != 0:
+			raise RuntimeError("setByCSPRNG", ret)
+	def serialize(self):
+		return _serialize(self, lib.sheSecretKeySerialize)
+	def serializeToHexStr(self):
+		return _hexStr(self.serialize())
+	def getPulicKey(self): # (sic) spelling kept because callers use this name
+		pub = PublicKey()
+		lib.sheGetPublicKey(byref(pub.v), byref(self.v))
+		return pub
+	def dec(self, c):
+		m = c_longlong()
+		ret = -1 # stays non-zero when c is not a supported ciphertext type, as in neg/add/sub
+		if isinstance(c, CipherTextG1):
+			ret = lib.sheDecG1(byref(m), byref(self.v), byref(c.v))
+		elif isinstance(c, CipherTextG2):
+			ret = lib.sheDecG2(byref(m), byref(self.v), byref(c.v))
+		elif isinstance(c, CipherTextGT):
+			ret = lib.sheDecGT(byref(m), byref(self.v), byref(c.v))
+		if ret != 0: raise RuntimeError("dec")
+		return m.value
+	def isZero(self, c):
+		if isinstance(c, CipherTextG1):
+			return lib.sheIsZeroG1(byref(self.v), byref(c.v)) == 1
+		elif isinstance(c, CipherTextG2):
+			return lib.sheIsZeroG2(byref(self.v), byref(c.v)) == 1
+		elif isinstance(c, CipherTextGT):
+			return lib.sheIsZeroGT(byref(self.v), byref(c.v)) == 1
+		raise RuntimeError("isZero") # unsupported ciphertext type
+
+def neg(c):
+	ret = -1
+	if isinstance(c, CipherTextG1):
+		out = CipherTextG1()
+		ret = lib.sheNegG1(byref(out.v), byref(c.v))
+	elif isinstance(c, CipherTextG2):
+		out = CipherTextG2()
+		ret = lib.sheNegG2(byref(out.v), byref(c.v))
+	elif isinstance(c, CipherTextGT):
+		out = CipherTextGT()
+		ret = lib.sheNegGT(byref(out.v), byref(c.v))
+	if ret != 0:
+		raise RuntimeError("neg")
+	return out
+
+def add(cx, cy):
+	ret = -1
+	if isinstance(cx, CipherTextG1) and isinstance(cy, CipherTextG1):
+		out = CipherTextG1()
+		ret = lib.sheAddG1(byref(out.v), byref(cx.v), byref(cy.v))
+	elif isinstance(cx, CipherTextG2) and isinstance(cy, CipherTextG2):
+		out = CipherTextG2()
+		ret = lib.sheAddG2(byref(out.v), byref(cx.v), byref(cy.v))
+	elif isinstance(cx, CipherTextGT) and isinstance(cy, CipherTextGT):
+		out = CipherTextGT()
+		ret = lib.sheAddGT(byref(out.v), byref(cx.v), byref(cy.v))
+	if ret != 0:
+		raise RuntimeError("add")
+	return out
+
+def sub(cx, cy):
+	ret = -1
+	if isinstance(cx, CipherTextG1) and isinstance(cy, CipherTextG1):
+		out = CipherTextG1()
+		ret = lib.sheSubG1(byref(out.v), byref(cx.v), byref(cy.v))
+	elif isinstance(cx, CipherTextG2) and isinstance(cy, CipherTextG2):
+		out = CipherTextG2()
+		ret = lib.sheSubG2(byref(out.v), byref(cx.v), byref(cy.v))
+	elif isinstance(cx, CipherTextGT) and isinstance(cy, CipherTextGT):
+		out = CipherTextGT()
+		ret = lib.sheSubGT(byref(out.v), byref(cx.v), byref(cy.v))
+	if ret != 0:
+		raise RuntimeError("sub")
+	return out
+
+def mul(cx, cy):
+	intTypes = (int, type(1 << 64)) # (int, long) on Python 2, (int, int) on Python 3; avoids the Python-2-only name `long`
+	if isinstance(cx, CipherTextG1) and isinstance(cy, CipherTextG2):
+		out = CipherTextGT() # G1 ciphertext times G2 ciphertext yields a GT ciphertext
+		if lib.sheMul(byref(out.v), byref(cx.v), byref(cy.v)) != 0:
+			raise RuntimeError("mul")
+		return out
+	if isinstance(cx, CipherTextG1) and isinstance(cy, intTypes):
+		return _enc(CipherTextG1, lib.sheMulG1, lib.sheMulIntVecG1, lib.sheNegG1, byref(cx.v), cy)
+	if isinstance(cx, CipherTextG2) and isinstance(cy, intTypes):
+		return _enc(CipherTextG2, lib.sheMulG2, lib.sheMulIntVecG2, lib.sheNegG2, byref(cx.v), cy)
+	if isinstance(cx, CipherTextGT) and isinstance(cy, intTypes):
+		return _enc(CipherTextGT, lib.sheMulGT, lib.sheMulIntVecGT, lib.sheNegGT, byref(cx.v), cy)
+	raise RuntimeError("mul") # unsupported operand combination
+
+def deserializeToSecretKey(buf):
+	return _deserialize(SecretKey, lib.sheSecretKeyDeserialize, buf)
+
+def deserializeToPublicKey(buf):
+	return _deserialize(PublicKey, lib.shePublicKeyDeserialize, buf)
+
+def deserializeToCipherTextG1(buf):
+	return _deserialize(CipherTextG1, lib.sheCipherTextG1Deserialize, buf)
+
+def deserializeToCipherTextG2(buf):
+	return _deserialize(CipherTextG2, lib.sheCipherTextG2Deserialize, buf)
+
+def deserializeToCipherTextGT(buf):
+	return _deserialize(CipherTextGT, lib.sheCipherTextGTDeserialize, buf)
+
+if __name__ == '__main__':
+	init(BLS12_381)
+	sec = SecretKey()
+	sec.setByCSPRNG()
+	print("sec=", sec.serializeToHexStr())
+	pub = sec.getPulicKey()
+	print("pub=", pub.serializeToHexStr())
+	if sec.serialize() != deserializeToSecretKey(sec.serialize()).serialize(): print("err-ser1")
+	if pub.serialize() != deserializeToPublicKey(pub.serialize()).serialize(): print("err-ser2")
+
+	m11 = 1
+	m12 = 5
+	m21 = 3
+	m22 = -4
+	c11 = pub.encG1(m11)
+	c12 = pub.encG1(m12)
+	# dec(enc) for G1
+	if sec.dec(c11) != m11: print("err1")
+
+	# add/sub for G1
+	if sec.dec(add(c11, c12)) != m11 + m12: print("err2")
+	if sec.dec(sub(c11, c12)) != m11 - m12: print("err3")
+
+	# add/sub for G2
+	c21 = pub.encG2(m21)
+	c22 = pub.encG2(m22)
+	if sec.dec(c21) != m21: print("err4")
+	if sec.dec(add(c21, c22)) != m21 + m22: print("err5")
+	if sec.dec(sub(c21, c22)) != m21 - m22: print("err6")
+
+	# mul const for G1/G2
+	if sec.dec(mul(c11, 3)) != m11 * 3: print("err_mul1")
+	if sec.dec(mul(c21, 7)) != m21 * 7: print("err_mul2")
+
+	if c11.serialize() != deserializeToCipherTextG1(c11.serialize()).serialize(): print("err-ser3")
+	if c21.serialize() != deserializeToCipherTextG2(c21.serialize()).serialize(): print("err-ser3")
+
+	# large integer
+	m1 = 0x140712384712047127412964192876419276341
+	m2 = -m1 + 123
+	c1 = pub.encG1(m1)
+	c2 = pub.encG1(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err-large11")
+	c1 = mul(pub.encG1(1), m1)
+	if sec.dec(add(c1, c2)) != 123: print("err-large12")
+
+	c1 = pub.encG2(m1)
+	c2 = pub.encG2(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err-large21")
+	c1 = mul(pub.encG2(1), m1)
+	if sec.dec(add(c1, c2)) != 123: print("err-large22")
+
+	c1 = pub.encGT(m1)
+	c2 = pub.encGT(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err-large31")
+	c1 = mul(pub.encGT(1), m1)
+	if sec.dec(add(c1, c2)) != 123: print("err-large32")
+	if c1.serialize() != deserializeToCipherTextGT(c1.serialize()).serialize(): print("err-ser4")
+
+	mt = -56
+	ct = pub.encGT(mt)
+	if sec.dec(ct) != mt: print("err7")
+
+	# mul G1 and G2
+	if sec.dec(mul(c11, c21)) != m11 * m21: print("err8")
+
+	if not sec.isZero(pub.encG1(0)): print("err-zero11")
+	if sec.isZero(pub.encG1(3)): print("err-zero12")
+	if not sec.isZero(pub.encG2(0)): print("err-zero21")
+	if sec.isZero(pub.encG2(3)): print("err-zero22")
+	if not sec.isZero(pub.encGT(0)): print("err-zero31")
+	if sec.isZero(pub.encGT(3)): print("err-zero32")
+
+	# use precomputedPublicKey for performance
+	ppub = pub.createPrecomputedPublicKey()
+	c1 = ppub.encG1(m11)
+	if sec.dec(c1) != m11: print("err9")
+
+	# large integer for precomputedPublicKey
+	m1 = 0x140712384712047127412964192876419276341
+	m2 = -m1 + 123
+	c1 = ppub.encG1(m1)
+	c2 = ppub.encG1(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err10")
+	c1 = ppub.encG2(m1)
+	c2 = ppub.encG2(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err11")
+	c1 = ppub.encGT(m1)
+	c2 = ppub.encGT(m2)
+	if sec.dec(add(c1, c2)) != 123: print("err12")
+
+	import sys
+	if sys.version_info.major >= 3:
+		import timeit
+		N = 100000
+		print(str(timeit.timeit("pub.encG1(12)", number=N, globals=globals()) / float(N) * 1e3) + "msec")
+		print(str(timeit.timeit("ppub.encG1(12)", number=N, globals=globals()) / float(N) * 1e3) + "msec")
+
+	ppub.destroy() # necessary to avoid memory leak
+
diff --git a/include/cybozu/random_generator.hpp b/include/cybozu/random_generator.hpp
index ff4a78da5d..375db06a8d 100644
--- a/include/cybozu/random_generator.hpp
+++ b/include/cybozu/random_generator.hpp
@@ -7,7 +7,9 @@
 	http://opensource.org/licenses/BSD-3-Clause
 */
 
+#ifndef CYBOZU_DONT_USE_EXCEPTION
 #include <cybozu/exception.hpp>
+#endif
 #ifdef _WIN32
 #include <winsock2.h>
 #include <windows.h>
@@ -27,22 +29,6 @@ class RandomGenerator {
 	RandomGenerator(const RandomGenerator&);
 	void operator=(const RandomGenerator&);
 public:
-	uint32_t operator()()
-	{
-		return get32();
-	}
-	uint32_t get32()
-	{
-		uint32_t ret;
-		read(&ret, 1);
-		return ret;
-	}
-	uint64_t get64()
-	{
-		uint64_t ret;
-		read(&ret, 1);
-		return ret;
-	}
 #ifdef _WIN32
 	RandomGenerator()
 		: prov_(0)
@@ -52,10 +38,15 @@ class RandomGenerator {
 		for (int i = 0; i < 2; i++) {
 			if (CryptAcquireContext(&prov_, NULL, NULL, PROV_RSA_FULL, flagTbl[i]) != 0) return;
 		}
+#ifdef CYBOZU_DONT_USE_EXCEPTION
+		prov_ = 0;
+#else
 		throw cybozu::Exception("randomgenerator");
+#endif
 	}
 	bool read_inner(void *buf, size_t byteSize)
 	{
+		if (prov_ == 0) return false;
 		return CryptGenRandom(prov_, static_cast<DWORD>(byteSize), static_cast<BYTE*>(buf)) != 0;
 	}
 	~RandomGenerator()
@@ -88,13 +79,6 @@ class RandomGenerator {
 		}
 		*pb = true;
 	}
-	template<class T>
-	void read(T *buf, size_t bufNum)
-	{
-		bool b;
-		read(&b, buf, bufNum);
-		if (!b) throw cybozu::Exception("RandomGenerator:read") << bufNum;
-	}
 private:
 	HCRYPTPROV prov_;
 	static const size_t bufSize = 1024;
@@ -105,7 +89,9 @@ class RandomGenerator {
 	RandomGenerator()
 		: fp_(::fopen("/dev/urandom", "rb"))
 	{
+#ifndef CYBOZU_DONT_USE_EXCEPTION
 		if (!fp_) throw cybozu::Exception("randomgenerator");
+#endif
 	}
 	~RandomGenerator()
 	{
@@ -118,9 +104,18 @@ class RandomGenerator {
 	template<class T>
 	void read(bool *pb, T *buf, size_t bufNum)
 	{
+		if (fp_ == 0) {
+			*pb = false;
+			return;
+		}
 		const size_t byteSize = sizeof(T) * bufNum;
 		*pb = ::fread(buf, 1, (int)byteSize, fp_) == byteSize;
 	}
+private:
+	FILE *fp_;
+#endif
+#ifndef CYBOZU_DONT_USE_EXCEPTION
+public:
 	template<class T>
 	void read(T *buf, size_t bufNum)
 	{
@@ -128,9 +123,23 @@ class RandomGenerator {
 		read(&b, buf, bufNum);
 		if (!b) throw cybozu::Exception("RandomGenerator:read") << bufNum;
 	}
+	uint32_t get32()
+	{
+		uint32_t ret;
+		read(&ret, 1);
+		return ret;
+	}
+	uint64_t get64()
+	{
+		uint64_t ret;
+		read(&ret, 1);
+		return ret;
+	}
+	uint32_t operator()()
+	{
+		return get32();
+	}
 #endif
-private:
-	FILE *fp_;
 };
 
 template<class T, class RG>
diff --git a/include/cybozu/sha2.hpp b/include/cybozu/sha2.hpp
index 1830936f02..335a8975e2 100644
--- a/include/cybozu/sha2.hpp
+++ b/include/cybozu/sha2.hpp
@@ -145,24 +145,22 @@ inline uint64_t rot64(uint64_t x, int s)
 
 template<class T>
 struct Common {
-	void term(const char *buf, size_t bufSize)
+	void term(uint8_t *buf, size_t bufSize) // buf holds bufSize pending bytes, is padded in place, and needs room for T::blockSize_ bytes
 	{
 		assert(bufSize < T::blockSize_);
 		T& self = static_cast<T&>(*this);
 		const uint64_t totalSize = self.totalSize_ + bufSize;
 
-		uint8_t last[T::blockSize_];
-		memcpy(last, buf, bufSize);
-		last[bufSize] = uint8_t(0x80); /* top bit = 1 */
-		memset(&last[bufSize + 1], 0, T::blockSize_ - bufSize - 1);
+		buf[bufSize] = uint8_t(0x80); /* top bit = 1 */
+		memset(&buf[bufSize + 1], 0, T::blockSize_ - bufSize - 1);
 		if (bufSize >= T::blockSize_ - T::msgLenByte_) {
-			self.round(reinterpret_cast<const char*>(last));
-			memset(last, 0, sizeof(last)); // clear stack
+			self.round(buf);
+			memset(buf, 0, T::blockSize_ - 8); // no room left for the length: start a zeroed extra block (its last 8 bytes are set below)
 		}
-		cybozu::Set64bitAsBE(&last[T::blockSize_ - 8], totalSize * 8);
-		self.round(reinterpret_cast<const char*>(last));
+		cybozu::Set64bitAsBE(&buf[T::blockSize_ - 8], totalSize * 8);
+		self.round(buf);
 	}
-	void inner_update(const char *buf, size_t bufSize)
+	void inner_update(const uint8_t *buf, size_t bufSize)
 	{
 		T& self = static_cast<T&>(*this);
 		if (bufSize == 0) return;
@@ -203,15 +201,35 @@ class Sha256 : public sha2_local::Common<Sha256> {
 	static const size_t msgLenByte_ = 8;
 	uint64_t totalSize_;
 	size_t roundBufSize_;
-	char roundBuf_[blockSize_];
+	uint8_t roundBuf_[blockSize_];
 	uint32_t h_[hSize_];
 	static const size_t outByteSize_ = hSize_ * sizeof(uint32_t);
 	const uint32_t *k_;
 
+	template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+	void round1(uint32_t *s, uint32_t *w, int i)
+	{
+		using namespace sha2_local;
+		uint32_t e = s[i4];
+		uint32_t h = s[i7];
+		h += rot32(e, 6) ^ rot32(e, 11) ^ rot32(e, 25);
+		uint32_t f = s[i5];
+		uint32_t g = s[i6];
+		h += g ^ (e & (f ^ g));
+		h += k_[i];
+		h += w[i];
+		s[i3] += h;
+		uint32_t a = s[i0];
+		uint32_t b = s[i1];
+		uint32_t c = s[i2];
+		h += rot32(a, 2) ^ rot32(a, 13) ^ rot32(a, 22);
+		h += ((a | b) & c) | (a & b);
+		s[i7] = h;
+	}
 	/**
 		@param buf [in] buffer(64byte)
 	*/
-	void round(const char *buf)
+	void round(const uint8_t *buf)
 	{
 		using namespace sha2_local;
 		uint32_t w[64];
@@ -225,38 +243,23 @@ class Sha256 : public sha2_local::Common<Sha256> {
 			uint32_t s1 = rot32(t, 17) ^ rot32(t, 19) ^ (t >> 10);
 			w[i] = w[i - 16] + s0 + w[i - 7] + s1;
 		}
-		uint32_t a = h_[0];
-		uint32_t b = h_[1];
-		uint32_t c = h_[2];
-		uint32_t d = h_[3];
-		uint32_t e = h_[4];
-		uint32_t f = h_[5];
-		uint32_t g = h_[6];
-		uint32_t h = h_[7];
-		for (int i = 0; i < 64; i++) {
-			uint32_t s1 = rot32(e, 6) ^ rot32(e, 11) ^ rot32(e, 25);
-			uint32_t ch = g ^ (e & (f ^ g));
-			uint32_t t1 = h + s1 + ch + k_[i] + w[i];
-			uint32_t s0 = rot32(a, 2) ^ rot32(a, 13) ^ rot32(a, 22);
-			uint32_t maj = ((a | b) & c) | (a & b);
-			uint32_t t2 = s0 + maj;
-			h = g;
-			g = f;
-			f = e;
-			e = d + t1;
-			d = c;
-			c = b;
-			b = a;
-			a = t1 + t2;
+		uint32_t s[8];
+		for (int i = 0; i < 8; i++) {
+			s[i] = h_[i];
+		}
+		for (int i = 0; i < 64; i += 8) {
+			round1<0, 1, 2, 3, 4, 5, 6, 7>(s, w, i + 0);
+			round1<7, 0, 1, 2, 3, 4, 5, 6>(s, w, i + 1);
+			round1<6, 7, 0, 1, 2, 3, 4, 5>(s, w, i + 2);
+			round1<5, 6, 7, 0, 1, 2, 3, 4>(s, w, i + 3);
+			round1<4, 5, 6, 7, 0, 1, 2, 3>(s, w, i + 4);
+			round1<3, 4, 5, 6, 7, 0, 1, 2>(s, w, i + 5);
+			round1<2, 3, 4, 5, 6, 7, 0, 1>(s, w, i + 6);
+			round1<1, 2, 3, 4, 5, 6, 7, 0>(s, w, i + 7);
+		}
+		for (int i = 0; i < 8; i++) {
+			h_[i] += s[i];
 		}
-		h_[0] += a;
-		h_[1] += b;
-		h_[2] += c;
-		h_[3] += d;
-		h_[4] += e;
-		h_[5] += f;
-		h_[6] += g;
-		h_[7] += h;
 		totalSize_ += blockSize_;
 	}
 public:
@@ -290,7 +293,7 @@ class Sha256 : public sha2_local::Common<Sha256> {
 	}
 	void update(const void *buf, size_t bufSize)
 	{
-		inner_update(reinterpret_cast<const char*>(buf), bufSize);
+		inner_update(reinterpret_cast<const uint8_t*>(buf), bufSize);
 	}
 	size_t digest(void *md, size_t mdSize, const void *buf, size_t bufSize)
 	{
@@ -329,7 +332,7 @@ class Sha512 : public sha2_local::Common<Sha512> {
 	static const size_t msgLenByte_ = 16;
 	uint64_t totalSize_;
 	size_t roundBufSize_;
-	char roundBuf_[blockSize_];
+	uint8_t roundBuf_[blockSize_];
 	uint64_t h_[hSize_];
 	static const size_t outByteSize_ = hSize_ * sizeof(uint64_t);
 	const uint64_t *k_;
@@ -359,7 +362,7 @@ class Sha512 : public sha2_local::Common<Sha512> {
 	/**
 		@param buf [in] buffer(64byte)
 	*/
-	void round(const char *buf)
+	void round(const uint8_t *buf)
 	{
 		using namespace sha2_local;
 		uint64_t w[80];
@@ -431,7 +434,7 @@ class Sha512 : public sha2_local::Common<Sha512> {
 	}
 	void update(const void *buf, size_t bufSize)
 	{
-		inner_update(reinterpret_cast<const char*>(buf), bufSize);
+		inner_update(reinterpret_cast<const uint8_t*>(buf), bufSize);
 	}
 	size_t digest(void *md, size_t mdSize, const void *buf, size_t bufSize)
 	{
diff --git a/include/mcl/bn.h b/include/mcl/bn.h
index c130004ac5..68053cbf8f 100644
--- a/include/mcl/bn.h
+++ b/include/mcl/bn.h
@@ -34,9 +34,11 @@
 	#ifndef MCLBN_NO_AUTOLINK
 		#if MCLBN_FP_UNIT_SIZE == 4
 			#pragma comment(lib, "mclbn256.lib")
-		#elif MCLBN_FP_UNIT_SIZE == 6
+		#elif (MCLBN_FP_UNIT_SIZE == 6) && (MCLBN_FR_UNIT_SIZE == 4)
+			#pragma comment(lib, "mclbn384_256.lib")
+		#elif (MCLBN_FP_UNIT_SIZE == 6) && (MCLBN_FR_UNIT_SIZE == 6)
 			#pragma comment(lib, "mclbn384.lib")
-		#else
+		#elif MCLBN_FP_UNIT_SIZE == 8
 			#pragma comment(lib, "mclbn512.lib")
 		#endif
 	#endif
@@ -68,6 +70,8 @@ typedef struct mclBnFr mclBnFr;
 typedef struct mclBnG1 mclBnG1;
 typedef struct mclBnG2 mclBnG2;
 typedef struct mclBnGT mclBnGT;
+typedef struct mclBnFp mclBnFp;
+typedef struct mclBnFp2 mclBnFp2;
 
 #else
 
@@ -87,6 +91,14 @@ typedef struct {
 	uint64_t d[MCLBN_FP_UNIT_SIZE * 12];
 } mclBnGT;
 
+typedef struct {
+	uint64_t d[MCLBN_FP_UNIT_SIZE];
+} mclBnFp;
+
+typedef struct {
+	mclBnFp d[2];
+} mclBnFp2;
+
 #endif
 
 #include <mcl/curve_type.h>
@@ -102,6 +114,8 @@ enum {
 	mclBls12_CurveFp381 = 5
 };
 
+// return the library version as a hex number: 0xABC means version A.BC
+MCLBN_DLL_API int mclBn_getVersion(void);
 /*
 	init library
 	@param curve [in] type of bn curve
@@ -146,6 +160,10 @@ MCLBN_DLL_API int mclBn_getG1ByteSize(void);
 	return bytes for serialized Fr
 */
 MCLBN_DLL_API int mclBn_getFrByteSize(void);
+/*
+	return bytes for serialized Fp
+*/
+MCLBN_DLL_API int mclBn_getFpByteSize(void);
 
 /*
 	return decimal string of the order of the curve(=the characteristic of Fr)
@@ -159,6 +177,12 @@ MCLBN_DLL_API mclSize mclBn_getCurveOrder(char *buf, mclSize maxBufSize);
 */
 MCLBN_DLL_API mclSize mclBn_getFieldOrder(char *buf, mclSize maxBufSize);
 
+/*
+	set ETH serialization mode for BLS12-381
+	@param ETHserialization [in] 1:enable,  0:disable
+	@note ignore the flag if curve is not BLS12-381
+*/
+MCLBN_DLL_API void mclBn_setETHserialization(int ETHserialization);
 ////////////////////////////////////////////////
 /*
 	deserialize
@@ -168,6 +192,8 @@ MCLBN_DLL_API mclSize mclBnFr_deserialize(mclBnFr *x, const void *buf, mclSize b
 MCLBN_DLL_API mclSize mclBnG1_deserialize(mclBnG1 *x, const void *buf, mclSize bufSize);
 MCLBN_DLL_API mclSize mclBnG2_deserialize(mclBnG2 *x, const void *buf, mclSize bufSize);
 MCLBN_DLL_API mclSize mclBnGT_deserialize(mclBnGT *x, const void *buf, mclSize bufSize);
+MCLBN_DLL_API mclSize mclBnFp_deserialize(mclBnFp *x, const void *buf, mclSize bufSize);
+MCLBN_DLL_API mclSize mclBnFp2_deserialize(mclBnFp2 *x, const void *buf, mclSize bufSize);
 
 /*
 	serialize
@@ -177,6 +203,8 @@ MCLBN_DLL_API mclSize mclBnFr_serialize(void *buf, mclSize maxBufSize, const mcl
 MCLBN_DLL_API mclSize mclBnG1_serialize(void *buf, mclSize maxBufSize, const mclBnG1 *x);
 MCLBN_DLL_API mclSize mclBnG2_serialize(void *buf, mclSize maxBufSize, const mclBnG2 *x);
 MCLBN_DLL_API mclSize mclBnGT_serialize(void *buf, mclSize maxBufSize, const mclBnGT *x);
+MCLBN_DLL_API mclSize mclBnFp_serialize(void *buf, mclSize maxBufSize, const mclBnFp *x);
+MCLBN_DLL_API mclSize mclBnFp2_serialize(void *buf, mclSize maxBufSize, const mclBnFp2 *x);
 
 /*
 	set string
@@ -190,6 +218,7 @@ MCLBN_DLL_API int mclBnFr_setStr(mclBnFr *x, const char *buf, mclSize bufSize, i
 MCLBN_DLL_API int mclBnG1_setStr(mclBnG1 *x, const char *buf, mclSize bufSize, int ioMode);
 MCLBN_DLL_API int mclBnG2_setStr(mclBnG2 *x, const char *buf, mclSize bufSize, int ioMode);
 MCLBN_DLL_API int mclBnGT_setStr(mclBnGT *x, const char *buf, mclSize bufSize, int ioMode);
+MCLBN_DLL_API int mclBnFp_setStr(mclBnFp *x, const char *buf, mclSize bufSize, int ioMode);
 
 /*
 	buf is terminated by '\0'
@@ -199,16 +228,29 @@ MCLBN_DLL_API mclSize mclBnFr_getStr(char *buf, mclSize maxBufSize, const mclBnF
 MCLBN_DLL_API mclSize mclBnG1_getStr(char *buf, mclSize maxBufSize, const mclBnG1 *x, int ioMode);
 MCLBN_DLL_API mclSize mclBnG2_getStr(char *buf, mclSize maxBufSize, const mclBnG2 *x, int ioMode);
 MCLBN_DLL_API mclSize mclBnGT_getStr(char *buf, mclSize maxBufSize, const mclBnGT *x, int ioMode);
+MCLBN_DLL_API mclSize mclBnFp_getStr(char *buf, mclSize maxBufSize, const mclBnFp *x, int ioMode);
 
 // set zero
 MCLBN_DLL_API void mclBnFr_clear(mclBnFr *x);
+MCLBN_DLL_API void mclBnFp_clear(mclBnFp *x);
+MCLBN_DLL_API void mclBnFp2_clear(mclBnFp2 *x);
 
 // set x to y
 MCLBN_DLL_API void mclBnFr_setInt(mclBnFr *y, mclInt x);
 MCLBN_DLL_API void mclBnFr_setInt32(mclBnFr *y, int x);
 
-// mask buf with (1 << (bitLen(r) - 1)) - 1 if buf >= r
+// x = buf & (1 << bitLen(r)) - 1
+// if (x >= r) x &= (1 << (bitLen(r) - 1)) - 1
+// always return 0
 MCLBN_DLL_API int mclBnFr_setLittleEndian(mclBnFr *x, const void *buf, mclSize bufSize);
+MCLBN_DLL_API int mclBnFp_setLittleEndian(mclBnFp *x, const void *buf, mclSize bufSize);
+
+// set (buf mod r) to x
+// return 0 if bufSize <= (byte size of Fr * 2) else -1
+MCLBN_DLL_API int mclBnFr_setLittleEndianMod(mclBnFr *x, const void *buf, mclSize bufSize);
+// set (buf mod p) to x
+// return 0 if bufSize <= (byte size of Fp * 2) else -1
+MCLBN_DLL_API int mclBnFp_setLittleEndianMod(mclBnFp *x, const void *buf, mclSize bufSize);
 
 // return 1 if true and 0 otherwise
 MCLBN_DLL_API int mclBnFr_isValid(const mclBnFr *x);
@@ -216,6 +258,9 @@ MCLBN_DLL_API int mclBnFr_isEqual(const mclBnFr *x, const mclBnFr *y);
 MCLBN_DLL_API int mclBnFr_isZero(const mclBnFr *x);
 MCLBN_DLL_API int mclBnFr_isOne(const mclBnFr *x);
 
+MCLBN_DLL_API int mclBnFp_isEqual(const mclBnFp *x, const mclBnFp *y);
+MCLBN_DLL_API int mclBnFp2_isEqual(const mclBnFp2 *x, const mclBnFp2 *y);
+
 #ifndef MCL_DONT_USE_CSRPNG
 // return 0 if success
 MCLBN_DLL_API int mclBnFr_setByCSPRNG(mclBnFr *x);
@@ -234,7 +279,12 @@ MCLBN_DLL_API void mclBn_setRandFunc(void *self, unsigned int (*readFunc)(void *
 // hash(s) and set x
 // return 0 if success
 MCLBN_DLL_API int mclBnFr_setHashOf(mclBnFr *x, const void *buf, mclSize bufSize);
+MCLBN_DLL_API int mclBnFp_setHashOf(mclBnFp *x, const void *buf, mclSize bufSize);
 
+// map x to y
+// return 0 if success else -1
+MCLBN_DLL_API int mclBnFp_mapToG1(mclBnG1 *y, const mclBnFp *x);
+MCLBN_DLL_API int mclBnFp2_mapToG2(mclBnG2 *y, const mclBnFp2 *x);
 
 MCLBN_DLL_API void mclBnFr_neg(mclBnFr *y, const mclBnFr *x);
 MCLBN_DLL_API void mclBnFr_inv(mclBnFr *y, const mclBnFr *x);
@@ -350,6 +400,7 @@ MCLBN_DLL_API void mclBn_precomputedMillerLoop2mixed(mclBnGT *f, const mclBnG1 *
 	Lagrange interpolation
 	recover out = y(0) by { (xVec[i], yVec[i]) }
 	return 0 if success else -1
+	@note *out = yVec[0] if k = 1
 	@note k >= 2, xVec[i] != 0, xVec[i] != xVec[j] for i != j
 */
 MCLBN_DLL_API int mclBn_FrLagrangeInterpolation(mclBnFr *out, const mclBnFr *xVec, const mclBnFr *yVec, mclSize k);
@@ -373,6 +424,13 @@ MCLBN_DLL_API int mclBn_G2EvaluatePolynomial(mclBnG2 *out, const mclBnG2 *cVec,
 MCLBN_DLL_API void mclBn_verifyOrderG1(int doVerify);
 MCLBN_DLL_API void mclBn_verifyOrderG2(int doVerify);
 
+/*
+	EXPERIMENTAL
+	only for curve = MCL_SECP* or MCL_NIST*
+	return standard base point of the current elliptic curve
+*/
+MCLBN_DLL_API int mclBnG1_getBasePoint(mclBnG1 *x);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp
index 708e97d89e..5ebe5d956e 100644
--- a/include/mcl/bn.hpp
+++ b/include/mcl/bn.hpp
@@ -315,11 +315,18 @@ struct Compress {
 };
 
 struct MapTo {
+	enum {
+		BNtype,
+		BLS12type,
+		STD_ECtype
+	};
 	Fp c1_; // sqrt(-3)
 	Fp c2_; // (-1 + sqrt(-3)) / 2
 	mpz_class z_;
 	mpz_class cofactor_;
-	bool isBN_;
+	int type_;
+	bool useNaiveMapTo_;
+
 	int legendre(bool *pb, const Fp& x) const
 	{
 		mpz_class xx;
@@ -488,27 +495,44 @@ struct MapTo {
 		(void)b;
 		c2_ = (c1_ - 1) / 2;
 	}
-	void init(const mpz_class& cofactor, const mpz_class &z, bool isBN, int curveType = -1)
+	/*
+		select type_ from curveType; if type_ == STD_ECtype, then cofactor, z are not used.
+	*/
+	void init(const mpz_class& cofactor, const mpz_class &z, int curveType)
 	{
-		isBN_ = isBN;
-		if (isBN_) {
-			initBN(cofactor, z, curveType);
+		if (0 <= curveType && curveType < MCL_EC_BEGIN) {
+			type_ = curveType == MCL_BLS12_381 ? BLS12type : BNtype;
+		} else {
+			type_ = STD_ECtype;
+		}
+		if (type_ == STD_ECtype) {
+			useNaiveMapTo_ = true;
 		} else {
+			useNaiveMapTo_ = false;
+		}
+#ifdef MCL_USE_OLD_MAPTO_FOR_BLS12
+		if (type_ == BLS12type) useNaiveMapTo_ = true; // test the member type_; there is no variable named `type` in scope
+#endif
+		if (type_ == BNtype) {
+			initBN(cofactor, z, curveType);
+		} else if (type_ == BLS12type) {
 			initBLS12(z);
 		}
 	}
 	bool calcG1(G1& P, const Fp& t) const
 	{
-		if (isBN_) {
-			if (!calcBN<G1, Fp>(P, t)) return false;
-			// no subgroup
-		} else {
-#ifdef MCL_USE_OLD_MAPTO_FOR_BLS12
+		if (useNaiveMapTo_) {
 			naiveMapTo<G1, Fp>(P, t);
-#else
+		} else {
 			if (!calcBN<G1, Fp>(P, t)) return false;
-#endif
+		}
+		switch (type_) {
+		case BNtype:
+			// no subgroup
+			break;
+		case BLS12type:
 			mulByCofactorBLS12(P, P);
+			break;
 		}
 		assert(P.isValid());
 		return true;
@@ -518,16 +542,18 @@ struct MapTo {
 	*/
 	bool calcG2(G2& P, const Fp2& t) const
 	{
-		if (isBN_) {
-			if (!calcBN<G2, Fp2>(P, t)) return false;
-			mulByCofactorBN(P, P);
-		} else {
-#ifdef MCL_USE_OLD_MAPTO_FOR_BLS12
+		if (useNaiveMapTo_) {
 			naiveMapTo<G2, Fp2>(P, t);
-#else
+		} else {
 			if (!calcBN<G2, Fp2>(P, t)) return false;
-#endif
+		}
+		switch(type_) {
+		case BNtype:
+			mulByCofactorBN(P, P);
+			break;
+		case BLS12type:
 			mulByCofactorBLS12(P, P);
+			break;
 		}
 		assert(P.isValid());
 		return true;
@@ -1018,6 +1044,9 @@ struct Param {
 	bool useNAF;
 	local::SignVec zReplTbl;
 
+	// for initG1only
+	G1 basePoint;
+
 	void init(bool *pb, const mcl::CurveParam& cp, fp::Mode mode)
 	{
 		this->cp = cp;
@@ -1099,14 +1128,31 @@ struct Param {
 		}
 */
 		if (isBLS12) {
-			mapTo.init(0, z, false);
+			mapTo.init(0, z, cp.curveType);
 		} else {
-			mapTo.init(2 * p - r, z, true, cp.curveType);
+			mapTo.init(2 * p - r, z, cp.curveType);
 		}
 		glv1.init(r, z, isBLS12, cp.curveType);
 		glv2.init(r, z, isBLS12);
+		basePoint.clear();
 		*pb = true;
 	}
+	void initG1only(bool *pb, const mcl::EcParam& para) // initialize Fp, Fr, G1 and basePoint from para; *pb reports success
+	{
+		Fp::init(pb, para.p);
+		if (!*pb) return;
+		Fr::init(pb, para.n);
+		if (!*pb) return;
+		G1::init(pb, para.a, para.b);
+		if (!*pb) return;
+		G1::setOrder(Fr::getOp().mp);
+		mapTo.init(0, 0, para.curveType);
+		Fp x0, y0;
+		x0.setStr(pb, para.gx);
+		if (!*pb) return;
+		y0.setStr(pb, para.gy); if (!*pb) return; // check before pb is reused by basePoint.set() below
+		basePoint.set(pb, x0, y0);
+	}
 #ifndef CYBOZU_DONT_USE_EXCEPTION
 	void init(const mcl::CurveParam& cp, fp::Mode mode)
 	{
@@ -2195,5 +2241,21 @@ inline void initPairing(const mcl::CurveParam& cp = mcl::BN254, fp::Mode mode =
 }
 #endif
 
+inline void initG1only(bool *pb, const mcl::EcParam& para)
+{
+	local::StaticVar<>::param.initG1only(pb, para);
+	if (!*pb) return;
+	G1::setMulArrayGLV(0);
+	G2::setMulArrayGLV(0);
+	Fp12::setPowArrayGLV(0);
+	G1::setCompressedExpression();
+	G2::setCompressedExpression();
+}
+
+inline const G1& getG1basePoint()
+{
+	return local::StaticVar<>::param.basePoint;
+}
+
 } } // mcl::bn
 
diff --git a/include/mcl/bn_c256.h b/include/mcl/bn_c256.h
new file mode 100644
index 0000000000..5135e788c7
--- /dev/null
+++ b/include/mcl/bn_c256.h
@@ -0,0 +1,11 @@
+#pragma once
+/**
+	@file
+	@brief C API of 256-bit optimal ate pairing over BN curves
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#define MCLBN_FP_UNIT_SIZE 4
+#include <mcl/bn.h>
+
diff --git a/include/mcl/bn_c384.h b/include/mcl/bn_c384.h
new file mode 100644
index 0000000000..23459e0035
--- /dev/null
+++ b/include/mcl/bn_c384.h
@@ -0,0 +1,12 @@
+#pragma once
+/**
+	@file
+	@brief C API of 384-bit optimal ate pairing over BN curves
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#define MCLBN_FP_UNIT_SIZE 6
+#define MCLBN_FR_UNIT_SIZE 6
+#include <mcl/bn.h>
+
diff --git a/include/mcl/bn_c384_256.h b/include/mcl/bn_c384_256.h
new file mode 100644
index 0000000000..bb7045ab9c
--- /dev/null
+++ b/include/mcl/bn_c384_256.h
@@ -0,0 +1,12 @@
+#pragma once
+/**
+	@file
+	@brief C API of 384/256-bit optimal ate pairing over BN curves
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#define MCLBN_FP_UNIT_SIZE 6
+#define MCLBN_FR_UNIT_SIZE 4
+#include <mcl/bn.h>
+
diff --git a/include/mcl/curve_type.h b/include/mcl/curve_type.h
index 5957d1ae80..9e4a941a0a 100644
--- a/include/mcl/curve_type.h
+++ b/include/mcl/curve_type.h
@@ -14,5 +14,22 @@ enum {
 	MCL_BN462 = 3,
 	MCL_BN_SNARK1 = 4,
 	MCL_BLS12_381 = 5,
-	MCL_BN160 = 6
+	MCL_BN160 = 6,
+
+	/*
+		for only G1
+		the size of curve must be less or equal to MCLBN_FP_UNIT_SIZE
+	*/
+	MCL_EC_BEGIN = 100,
+	MCL_SECP192K1 = MCL_EC_BEGIN,
+	MCL_SECP224K1 = 101,
+	MCL_SECP256K1 = 102,
+	MCL_SECP384R1 = 103,
+	MCL_SECP521R1 = 104,
+	MCL_NIST_P192 = 105,
+	MCL_NIST_P224 = 106,
+	MCL_NIST_P256 = 107,
+	MCL_EC_END = MCL_NIST_P256 + 1,
+	MCL_NIST_P384 = MCL_SECP384R1,
+	MCL_NIST_P521 = MCL_SECP521R1
 };
diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp
index 8ebf7e7572..ad6e6db447 100644
--- a/include/mcl/ec.hpp
+++ b/include/mcl/ec.hpp
@@ -20,14 +20,35 @@
 
 namespace mcl {
 
+template<class _Fp> class Fp2T;
+
 namespace ec {
 
 enum Mode {
-	Jacobi,
-	Proj
+	Jacobi = 0,
+	Proj = 1
 };
 
-} // mcl::ecl
+namespace local {
+
+// x is negative <=> x >= half(:=(p+1)/2) <=> a = 1
+template<class Fp>
+bool get_a_flag(const Fp& x)
+{
+	return x.isNegative();
+}
+
+// Im(x) is negative <=> Im(x) >= half(:=(p+1)/2) <=> a = 1
+
+template<class Fp>
+bool get_a_flag(const mcl::Fp2T<Fp>& x)
+{
+	return get_a_flag(x.b); // x = a + bi
+}
+
+} // mcl::ec::local
+
+} // mcl::ec
 
 /*
 	elliptic curve
@@ -423,27 +444,41 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		dblNoVerifyInf(R, P);
 	}
 #ifndef MCL_EC_USE_AFFINE
-	static inline void addJacobi(EcT& R, const EcT& P, const EcT& Q)
+	static inline void addJacobi(EcT& R, const EcT& P, const EcT& Q, bool isPzOne, bool isQzOne)
 	{
-		const bool isQzOne = Q.z.isOne();
 		Fp r, U1, S1, H, H3;
-		Fp::sqr(r, P.z);
+		if (isPzOne) {
+			// r = 1;
+		} else {
+			Fp::sqr(r, P.z);
+		}
 		if (isQzOne) {
 			U1 = P.x;
-			Fp::mul(H, Q.x, r);
+			if (isPzOne) {
+				H = Q.x;
+			} else {
+				Fp::mul(H, Q.x, r);
+			}
 			H -= U1;
-			r *= P.z;
 			S1 = P.y;
 		} else {
 			Fp::sqr(S1, Q.z);
 			Fp::mul(U1, P.x, S1);
-			Fp::mul(H, Q.x, r);
+			if (isPzOne) {
+				H = Q.x;
+			} else {
+				Fp::mul(H, Q.x, r);
+			}
 			H -= U1;
-			r *= P.z;
 			S1 *= Q.z;
 			S1 *= P.y;
 		}
-		r *= Q.y;
+		if (isPzOne) {
+			r = Q.y;
+		} else {
+			r *= P.z;
+			r *= Q.y;
+		}
 		r -= S1;
 		if (H.isZero()) {
 			if (r.isZero()) {
@@ -453,11 +488,13 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 			}
 			return;
 		}
+		// R may be the same object as P or Q, so R.z is only written by a statement that
+		// has already read every input z it needs (never store R.z and then re-read Q.z).
 		if (isQzOne) {
-			Fp::mul(R.z, P.z, H);
+			if (isPzOne) { R.z = H; } else { Fp::mul(R.z, P.z, H); }
 		} else {
-			Fp::mul(R.z, P.z, Q.z);
+			if (isPzOne) { R.z = Q.z; } else { Fp::mul(R.z, P.z, Q.z); }
 			R.z *= H;
 		}
 		Fp::sqr(H3, H); // H^2
 		Fp::sqr(R.y, r); // r^2
@@ -471,9 +508,8 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		H3 *= S1;
 		Fp::sub(R.y, U1, H3);
 	}
-	static inline void addProj(EcT& R, const EcT& P, const EcT& Q)
+	static inline void addProj(EcT& R, const EcT& P, const EcT& Q, bool isPzOne, bool isQzOne)
 	{
-		const bool isQzOne = Q.z.isOne();
 		Fp r, PyQz, v, A, vv;
 		if (isQzOne) {
 			r = P.x;
@@ -482,8 +518,13 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 			Fp::mul(r, P.x, Q.z);
 			Fp::mul(PyQz, P.y, Q.z);
 		}
-		Fp::mul(A, Q.y, P.z);
-		Fp::mul(v, Q.x, P.z);
+		if (isPzOne) {
+			A = Q.y;
+			v = Q.x;
+		} else {
+			Fp::mul(A, Q.y, P.z);
+			Fp::mul(v, Q.x, P.z);
+		}
 		v -= r;
 		if (v.isZero()) {
 			if (A == PyQz) {
@@ -501,10 +542,19 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		if (isQzOne) {
 			R.z = P.z;
 		} else {
-			Fp::mul(R.z, P.z, Q.z);
+			if (isPzOne) {
+				R.z = Q.z;
+			} else {
+				Fp::mul(R.z, P.z, Q.z);
+			}
+		}
+		// R.z = 1 if isPzOne && isQzOne
+		if (isPzOne && isQzOne) {
+			R.z = vv;
+		} else {
+			A *= R.z;
+			R.z *= vv;
 		}
-		A *= R.z;
-		R.z *= vv;
 		A -= vv;
 		vv *= PyQz;
 		A -= r;
@@ -515,17 +565,14 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		R.y -= vv;
 	}
 #endif
-	static inline void add(EcT& R, const EcT& P0, const EcT& Q0)
-	{
-		if (P0.isZero()) { R = Q0; return; }
-		if (Q0.isZero()) { R = P0; return; }
-		if (&P0 == &Q0) {
-			dblNoVerifyInf(R, P0);
+	static inline void add(EcT& R, const EcT& P, const EcT& Q) {
+		if (P.isZero()) { R = Q; return; }
+		if (Q.isZero()) { R = P; return; }
+		if (&P == &Q) {
+			dblNoVerifyInf(R, P);
 			return;
 		}
 #ifdef MCL_EC_USE_AFFINE
-		const EcT& P(P0);
-		const EcT& Q(Q0);
 		Fp t;
 		Fp::neg(t, Q.y);
 		if (P.y == t) { R.clear(); return; }
@@ -547,19 +594,14 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		Fp::sub(R.y, s, P.y);
 		R.x = x3;
 #else
-		const EcT *pP = &P0;
-		const EcT *pQ = &Q0;
-		if (pP->z.isOne()) {
-			fp::swap_(pP, pQ);
-		}
-		const EcT& P(*pP);
-		const EcT& Q(*pQ);
+		bool isPzOne = P.z.isOne();
+		bool isQzOne = Q.z.isOne();
 		switch (mode_) {
 		case ec::Jacobi:
-			addJacobi(R, P, Q);
+			addJacobi(R, P, Q, isPzOne, isQzOne);
 			break;
 		case ec::Proj:
-			addProj(R, P, Q);
+			addProj(R, P, Q, isPzOne, isQzOne);
 			break;
 		}
 #endif
@@ -687,28 +729,44 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 		EcT P(*this);
 		P.normalize();
 		if (ioMode & (IoSerialize | IoSerializeHexStr)) {
-			/*
-				if (isMSBserialize()) {
-				  // n bytes
-				  x | (y.isOdd ? 0x80 : 0)
-				} else {
-				  // n + 1 bytes
-				  (y.isOdd ? 3 : 2), x
-				}
-			*/
 			const size_t n = Fp::getByteSize();
 			const size_t adj = isMSBserialize() ? 0 : 1;
-			char buf[sizeof(Fp) + 1];
-			if (isZero()) {
-				memset(buf, 0, n + adj);
+			uint8_t buf[sizeof(Fp) + 1];
+			if (Fp::BaseFp::isETHserialization()) {
+				const uint8_t c_flag = 0x80;
+				const uint8_t b_flag = 0x40;
+				const uint8_t a_flag = 0x20;
+				if (P.isZero()) {
+					buf[0] = c_flag | b_flag;
+					memset(buf + 1, 0, n - 1);
+				} else {
+					cybozu::MemoryOutputStream mos(buf, n);
+					P.x.save(pb, mos, IoSerialize); if (!*pb) return;
+					uint8_t cba = c_flag;
+					if (ec::local::get_a_flag(P.y)) cba |= a_flag;
+					buf[0] |= cba;
+				}
 			} else {
-				cybozu::MemoryOutputStream mos(buf + adj, n);
-				P.x.save(pb, mos, IoSerialize); if (!*pb) return;
-				if (adj) {
-					buf[0] = P.y.isOdd() ? 3 : 2;
+				/*
+					if (isMSBserialize()) {
+					  // n bytes
+					  x | (y.isOdd ? 0x80 : 0)
+					} else {
+					  // n + 1 bytes
+					  (y.isOdd ? 3 : 2), x
+					}
+				*/
+				if (isZero()) {
+					memset(buf, 0, n + adj);
 				} else {
-					if (P.y.isOdd()) {
-						buf[n - 1] |= 0x80;
+					cybozu::MemoryOutputStream mos(buf + adj, n);
+					P.x.save(pb, mos, IoSerialize); if (!*pb) return;
+					if (adj) {
+						buf[0] = P.y.isOdd() ? 3 : 2;
+					} else {
+						if (P.y.isOdd()) {
+							buf[n - 1] |= 0x80;
+						}
 					}
 				}
 			}
@@ -757,7 +815,7 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 			const size_t n = Fp::getByteSize();
 			const size_t adj = isMSBserialize() ? 0 : 1;
 			const size_t n1 = n + adj;
-			char buf[sizeof(Fp) + 1];
+			uint8_t buf[sizeof(Fp) + 1];
 			size_t readSize;
 			if (ioMode & IoSerializeHexStr) {
 				readSize = mcl::fp::readHexStr(buf, n1, is);
@@ -768,6 +826,38 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 				*pb = false;
 				return;
 			}
+			if (Fp::BaseFp::isETHserialization()) {
+				const uint8_t c_flag = 0x80; // compressed form
+				const uint8_t b_flag = 0x40; // point at infinity
+				const uint8_t a_flag = 0x20; // selects which square root is y (see get_a_flag)
+				*pb = false;
+				if ((buf[0] & c_flag) == 0) { // assume compressed
+					return;
+				}
+				if (buf[0] & b_flag) { // infinity
+					if (buf[0] != (c_flag | b_flag)) return;
+					for (size_t i = 1; i < n; i++) { // every remaining byte, including the last one, must be zero
+						if (buf[i]) return;
+					}
+					clear();
+					*pb = true;
+					return;
+				}
+				bool a = (buf[0] & a_flag) != 0;
+				buf[0] &= ~(c_flag | b_flag | a_flag);
+				mcl::fp::local::byteSwap(buf, n);
+				x.setArray(pb, buf, n);
+				if (!*pb) return;
+				getWeierstrass(y, x);
+				if (!Fp::squareRoot(y, y)) {
+					*pb = false;
+					return;
+				}
+				if (ec::local::get_a_flag(y) ^ a) {
+					Fp::neg(y, y);
+				}
+				return; // NOTE(review): this branch assigns only x and y before returning; confirm z and any order check are handled for this path
+			}
 			if (fp::isZeroArray(buf, n1)) {
 				clear();
 				*pb = true;
@@ -889,6 +979,10 @@ class EcT : public fp::Serializable<EcT<_Fp> > {
 	bool operator<=(const EcT& rhs) const { return !operator>(rhs); }
 	static inline void mulArray(EcT& z, const EcT& x, const fp::Unit *y, size_t yn, bool isNegative, bool constTime = false)
 	{
+		if (!constTime && x.isZero()) {
+			z.clear();
+			return;
+		}
 		if (mulArrayGLV && (constTime || yn > 1)) {
 			mulArrayGLV(z, x, y, yn, isNegative, constTime);
 			return;
@@ -983,6 +1077,7 @@ struct EcParam {
 	const char *gy;
 	const char *n;
 	size_t bitSize; // bit length of p
+	int curveType;
 };
 
 } // mcl
diff --git a/include/mcl/ecparam.hpp b/include/mcl/ecparam.hpp
index 19b76bf556..087bf8b6c0 100644
--- a/include/mcl/ecparam.hpp
+++ b/include/mcl/ecparam.hpp
@@ -7,6 +7,7 @@
 	http://opensource.org/licenses/BSD-3-Clause
 */
 #include <mcl/ec.hpp>
+#include <mcl/curve_type.h>
 
 namespace mcl { namespace ecparam {
 
@@ -18,7 +19,8 @@ const struct mcl::EcParam secp160k1 = {
 	"0x3b4c382ce37aa192a4019e763036f4f5dd4d7ebb",
 	"0x938cf935318fdced6bc28286531733c3f03c4fee",
 	"0x100000000000000000001b8fa16dfab9aca16b6b3",
-	160
+	160,
+	-1
 };
 // p=2^160 + 7
 const struct mcl::EcParam p160_1 = {
@@ -29,7 +31,8 @@ const struct mcl::EcParam p160_1 = {
 	"1",
 	"1236612389951462151661156731535316138439983579284",
 	"1461501637330902918203683518218126812711137002561",
-	161
+	161,
+	-1
 };
 const struct mcl::EcParam secp192k1 = {
 	"secp192k1",
@@ -39,7 +42,8 @@ const struct mcl::EcParam secp192k1 = {
 	"0xdb4ff10ec057e9ae26b07d0280b7f4341da5d1b1eae06c7d",
 	"0x9b2f2f6d9c5628a7844163d015be86344082aa88d95e2f9d",
 	"0xfffffffffffffffffffffffe26f2fc170f69466a74defd8d",
-	192
+	192,
+	MCL_SECP192K1
 };
 const struct mcl::EcParam secp224k1 = {
 	"secp224k1",
@@ -49,7 +53,8 @@ const struct mcl::EcParam secp224k1 = {
 	"0xa1455b334df099df30fc28a169a467e9e47075a90f7e650eb6b7a45c",
 	"0x7e089fed7fba344282cafbd6f7e319f7c0b0bd59e2ca4bdb556d61a5",
 	"0x10000000000000000000000000001dce8d2ec6184caf0a971769fb1f7",
-	224
+	224,
+	MCL_SECP224K1
 };
 const struct mcl::EcParam secp256k1 = {
 	"secp256k1",
@@ -59,7 +64,8 @@ const struct mcl::EcParam secp256k1 = {
 	"0x79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798",
 	"0x483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8",
 	"0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141",
-	256
+	256,
+	MCL_SECP256K1
 };
 const struct mcl::EcParam secp384r1 = {
 	"secp384r1",
@@ -69,7 +75,8 @@ const struct mcl::EcParam secp384r1 = {
 	"0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",
 	"0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",
 	"0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973",
-	384
+	384,
+	MCL_SECP384R1
 };
 const struct mcl::EcParam secp521r1 = {
 	"secp521r1",
@@ -79,7 +86,8 @@ const struct mcl::EcParam secp521r1 = {
 	"0xc6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
 	"0x11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
 	"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409",
-	521
+	521,
+	MCL_SECP521R1
 };
 const struct mcl::EcParam NIST_P192 = {
 	"NIST_P192",
@@ -89,7 +97,8 @@ const struct mcl::EcParam NIST_P192 = {
 	"0x188da80eb03090f67cbf20eb43a18800f4ff0afd82ff1012",
 	"0x07192b95ffc8da78631011ed6b24cdd573f977a11e794811",
 	"0xffffffffffffffffffffffff99def836146bc9b1b4d22831",
-	192
+	192,
+	MCL_NIST_P192
 };
 const struct mcl::EcParam NIST_P224 = {
 	"NIST_P224",
@@ -99,7 +108,8 @@ const struct mcl::EcParam NIST_P224 = {
 	"0xb70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21",
 	"0xbd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34",
 	"0xffffffffffffffffffffffffffff16a2e0b8f03e13dd29455c5c2a3d",
-	224
+	224,
+	MCL_NIST_P224
 };
 const struct mcl::EcParam NIST_P256 = {
 	"NIST_P256",
@@ -109,7 +119,8 @@ const struct mcl::EcParam NIST_P256 = {
 	"0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296",
 	"0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5",
 	"0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551",
-	256
+	256,
+	MCL_NIST_P256
 };
 // same secp384r1
 const struct mcl::EcParam NIST_P384 = {
@@ -120,7 +131,8 @@ const struct mcl::EcParam NIST_P384 = {
 	"0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",
 	"0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",
 	"0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973",
-	384
+	384,
+	MCL_NIST_P384
 };
 // same secp521r1
 const struct mcl::EcParam NIST_P521 = {
@@ -131,7 +143,8 @@ const struct mcl::EcParam NIST_P521 = {
 	"0xc6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
 	"0x11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
 	"0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409",
-	521
+	521,
+	MCL_NIST_P521
 };
 
 } // mcl::ecparam
@@ -161,4 +174,23 @@ static inline const mcl::EcParam* getEcParam(const std::string& name)
 }
 #endif
 
+/*
+	return the parameters of the standard curve identified by an MCL_SECP* / MCL_NIST_* value
+	(MCL_NIST_P384 and MCL_NIST_P521 are aliases of MCL_SECP384R1 and MCL_SECP521R1), or 0 if unknown
+*/
+inline const mcl::EcParam* getEcParam(int curve)
+{
+	switch (curve) {
+	case MCL_SECP192K1: return &ecparam::secp192k1;
+	case MCL_SECP224K1: return &ecparam::secp224k1;
+	case MCL_SECP256K1: return &ecparam::secp256k1;
+	case MCL_SECP384R1: return &ecparam::secp384r1;
+	case MCL_SECP521R1: return &ecparam::secp521r1;
+	case MCL_NIST_P192: return &ecparam::NIST_P192;
+	case MCL_NIST_P224: return &ecparam::NIST_P224;
+	case MCL_NIST_P256: return &ecparam::NIST_P256;
+	default: return 0;
+	}
+}
+
 } // mcl
diff --git a/include/mcl/elgamal.hpp b/include/mcl/elgamal.hpp
index 431148508a..9cf29198ab 100644
--- a/include/mcl/elgamal.hpp
+++ b/include/mcl/elgamal.hpp
@@ -124,26 +124,26 @@ struct ElgamalT {
 		http://dx.doi.org/10.1587/transfun.E96.A.1156
 	*/
 	struct Zkp {
-		Zn c0, c1, s0, s1;
+		Zn c[2], s[2];
 		template<class InputStream>
 		void load(InputStream& is, int ioMode = IoSerialize)
 		{
-			c0.load(is, ioMode);
-			c1.load(is, ioMode);
-			s0.load(is, ioMode);
-			s1.load(is, ioMode);
+			c[0].load(is, ioMode);
+			c[1].load(is, ioMode);
+			s[0].load(is, ioMode);
+			s[1].load(is, ioMode);
 		}
 		template<class OutputStream>
 		void save(OutputStream& os, int ioMode = IoSerialize) const
 		{
 			const char sep = *fp::getIoSeparator(ioMode);
-			c0.save(os, ioMode);
+			c[0].save(os, ioMode);
 			if (sep) cybozu::writeChar(os, sep);
-			c1.save(os, ioMode);
+			c[1].save(os, ioMode);
 			if (sep) cybozu::writeChar(os, sep);
-			s0.save(os, ioMode);
+			s[0].save(os, ioMode);
 			if (sep) cybozu::writeChar(os, sep);
-			s1.save(os, ioMode);
+			s[1].save(os, ioMode);
 		}
 		void getStr(std::string& str, int ioMode = 0) const
 		{
@@ -179,11 +179,9 @@ struct ElgamalT {
 
 	class PublicKey {
 		size_t bitSize;
-		Ec f;
 		Ec g;
 		Ec h;
 		bool enableWindowMethod_;
-		fp::WindowMethod<Ec> wm_f;
 		fp::WindowMethod<Ec> wm_g;
 		fp::WindowMethod<Ec> wm_h;
 		template<class N>
@@ -196,8 +194,6 @@ struct ElgamalT {
 			}
 		}
 		template<class N>
-		void mulF(Ec& z, const N& n) const { mulDispatch(z, f, n, wm_f); }
-		template<class N>
 		void mulG(Ec& z, const N& n) const { mulDispatch(z, g, n, wm_g); }
 		template<class N>
 		void mulH(Ec& z, const N& n) const { mulDispatch(z, h, n, wm_h); }
@@ -209,16 +205,14 @@ struct ElgamalT {
 		}
 		void enableWindowMethod(size_t winSize = 10)
 		{
-			wm_f.init(f, bitSize, winSize);
 			wm_g.init(g, bitSize, winSize);
 			wm_h.init(h, bitSize, winSize);
 			enableWindowMethod_ = true;
 		}
-		const Ec& getF() const { return f; }
-		void init(size_t bitSize, const Ec& f, const Ec& g, const Ec& h)
+		const Ec& getG() const { return g; }
+		void init(size_t bitSize, const Ec& g, const Ec& h)
 		{
 			this->bitSize = bitSize;
-			this->f = f;
 			this->g = g;
 			this->h = h;
 			enableWindowMethod_ = false;
@@ -227,7 +221,7 @@ struct ElgamalT {
 		/*
 			encode message
 			input : m
-			output : c = (c1, c2) = (g^u, h^u f^m)
+			output : c = (c1, c2) = (g^u, h^u g^m)
 		*/
 		void enc(CipherText& c, const Zn& m, fp::RandGen rg = fp::RandGen()) const
 		{
@@ -236,7 +230,7 @@ struct ElgamalT {
 			mulG(c.c1, u);
 			mulH(c.c2, u);
 			Ec t;
-			mulF(t, m);
+			mulG(t, m);
 			Ec::add(c.c2, c.c2, t);
 		}
 		/*
@@ -253,77 +247,58 @@ struct ElgamalT {
 			u.setRand(rg);
 			mulG(c.c1, u);
 			mulH(c.c2, u);
+			Ec t1, t2;
+			Ec R1[2], R2[2];
+			zkp.c[1-m].setRand(rg);
+			zkp.s[1-m].setRand(rg);
+			mulG(t1, zkp.s[1-m]);
+			Ec::mul(t2, c.c1, zkp.c[1-m]);
+			Ec::sub(R1[1-m], t1, t2);
+			mulH(t1, zkp.s[1-m]);
 			if (m) {
-				Ec::add(c.c2, c.c2, f);
-				Zn r1;
-				r1.setRand(rg);
-				zkp.c0.setRand(rg);
-				zkp.s0.setRand(rg);
-				Ec R01, R02, R11, R12;
-				Ec t1, t2;
-				mulG(t1, zkp.s0);
-				Ec::mul(t2, c.c1, zkp.c0);
-				Ec::sub(R01, t1, t2);
-				mulH(t1, zkp.s0);
-				Ec::mul(t2, c.c2, zkp.c0);
-				Ec::sub(R02, t1, t2);
-				mulG(R11, r1);
-				mulH(R12, r1);
-				std::ostringstream os;
-				os << R01 << R02 << R11 << R12 << c.c1 << c.c2 << f << g << h;
-				Zn cc;
-				cc.setHashOf(os.str());
-				zkp.c1 = cc - zkp.c0;
-				zkp.s1 = r1 + zkp.c1 * u;
+				Ec::add(c.c2, c.c2, g);
+				Ec::mul(t2, c.c2, zkp.c[0]);
 			} else {
-				Zn r0;
-				r0.setRand(rg);
-				zkp.c1.setRand(rg);
-				zkp.s1.setRand(rg);
-				Ec R01, R02, R11, R12;
-				mulG(R01, r0);
-				mulH(R02, r0);
-				Ec t1, t2;
-				mulG(t1, zkp.s1);
-				Ec::mul(t2, c.c1, zkp.c1);
-				Ec::sub(R11, t1, t2);
-				mulH(t1, zkp.s1);
-				Ec::sub(t2, c.c2, f);
-				Ec::mul(t2, t2, zkp.c1);
-				Ec::sub(R12, t1, t2);
-				std::ostringstream os;
-				os << R01 << R02 << R11 << R12 << c.c1 << c.c2 << f << g << h;
-				Zn cc;
-				cc.setHashOf(os.str());
-				zkp.c0 = cc - zkp.c1;
-				zkp.s0 = r0 + zkp.c0 * u;
+				Ec::sub(t2, c.c2, g);
+				Ec::mul(t2, t2, zkp.c[1]);
 			}
+			Ec::sub(R2[1-m], t1, t2);
+			Zn r;
+			r.setRand(rg);
+			mulG(R1[m], r);
+			mulH(R2[m], r);
+			std::ostringstream os;
+			os << R1[0] << R2[0] << R1[1] << R2[1] << c.c1 << c.c2 << g << h;
+			Zn cc;
+			cc.setHashOf(os.str());
+			zkp.c[m] = cc - zkp.c[1-m];
+			zkp.s[m] = r + zkp.c[m] * u;
 		}
 		/*
 			verify cipher text with ZKP
 		*/
 		bool verify(const CipherText& c, const Zkp& zkp) const
 		{
-			Ec R01, R02, R11, R12;
+			Ec R1[2], R2[2];
 			Ec t1, t2;
-			mulG(t1, zkp.s0);
-			Ec::mul(t2, c.c1, zkp.c0);
-			Ec::sub(R01, t1, t2);
-			mulH(t1, zkp.s0);
-			Ec::mul(t2, c.c2, zkp.c0);
-			Ec::sub(R02, t1, t2);
-			mulG(t1, zkp.s1);
-			Ec::mul(t2, c.c1, zkp.c1);
-			Ec::sub(R11, t1, t2);
-			mulH(t1, zkp.s1);
-			Ec::sub(t2, c.c2, f);
-			Ec::mul(t2, t2, zkp.c1);
-			Ec::sub(R12, t1, t2);
+			mulG(t1, zkp.s[0]);
+			Ec::mul(t2, c.c1, zkp.c[0]);
+			Ec::sub(R1[0], t1, t2);
+			mulH(t1, zkp.s[0]);
+			Ec::mul(t2, c.c2, zkp.c[0]);
+			Ec::sub(R2[0], t1, t2);
+			mulG(t1, zkp.s[1]);
+			Ec::mul(t2, c.c1, zkp.c[1]);
+			Ec::sub(R1[1], t1, t2);
+			mulH(t1, zkp.s[1]);
+			Ec::sub(t2, c.c2, g);
+			Ec::mul(t2, t2, zkp.c[1]);
+			Ec::sub(R2[1], t1, t2);
 			std::ostringstream os;
-			os << R01 << R02 << R11 << R12 << c.c1 << c.c2 << f << g << h;
+			os << R1[0] << R2[0] << R1[1] << R2[1] << c.c1 << c.c2 << g << h;
 			Zn cc;
 			cc.setHashOf(os.str());
-			return cc == zkp.c0 + zkp.c1;
+			return cc == zkp.c[0] + zkp.c[1];
 		}
 		/*
 			rerandomize encoded message
@@ -343,13 +318,13 @@ struct ElgamalT {
 		/*
 			add encoded message with plain message
 			input : c = Enc(m1) = (c1, c2), m2
-			ouput : c = Enc(m1 + m2) = (c1, c2 f^m2)
+			output : c = Enc(m1 + m2) = (c1, c2 g^m2)
 		*/
 		template<class N>
 		void add(CipherText& c, const N& m) const
 		{
 			Ec fm;
-			mulF(fm, m);
+			mulG(fm, m);
 			Ec::add(c.c2, c.c2, fm);
 		}
 		template<class InputStream>
@@ -358,10 +333,9 @@ struct ElgamalT {
 			std::string s;
 			mcl::fp::local::loadWord(s, is);
 			bitSize = cybozu::atoi(s);
-			f.load(is, ioMode);
 			g.load(is, ioMode);
 			h.load(is, ioMode);
-			init(bitSize, f, g, h);
+			init(bitSize, g, h);
 		}
 		template<class OutputStream>
 		void save(OutputStream& os, int ioMode = IoSerialize) const
@@ -371,7 +345,6 @@ struct ElgamalT {
 			cybozu::writeChar(os, ' ');
 
 			const char sep = *fp::getIoSeparator(ioMode);
-			f.save(os, ioMode);
 			if (sep) cybozu::writeChar(os, sep);
 			g.save(os, ioMode);
 			if (sep) cybozu::writeChar(os, sep);
@@ -410,7 +383,7 @@ struct ElgamalT {
 		void fromStr(const std::string& str) { setStr(str); }
 	};
 	/*
-		create table f^i for i in [rangeMin, rangeMax]
+		create table g^i for i in [rangeMin, rangeMax]
 	*/
 	struct PowerCache {
 #if (CYBOZU_CPP_VERSION > CYBOZU_CPP_VERSION_CP03)
@@ -419,18 +392,18 @@ struct ElgamalT {
 		typedef std::map<Ec, int> Cache;
 #endif
 		Cache cache;
-		void init(const Ec& f, int rangeMin, int rangeMax)
+		void init(const Ec& g, int rangeMin, int rangeMax)
 		{
 			if (rangeMin > rangeMax) throw cybozu::Exception("mcl:ElgamalT:PowerCache:bad range") << rangeMin << rangeMax;
 			Ec x;
 			x.clear();
 			cache[x] = 0;
 			for (int i = 1; i <= rangeMax; i++) {
-				Ec::add(x, x, f);
+				Ec::add(x, x, g);
 				cache[x] = i;
 			}
 			Ec nf;
-			Ec::neg(nf, f);
+			Ec::neg(nf, g);
 			x.clear();
 			for (int i = -1; i >= rangeMin; i--) {
 				Ec::add(x, x, nf);
@@ -438,17 +411,17 @@ struct ElgamalT {
 			}
 		}
 		/*
-			return m such that f^m = g
+			return m such that g^m = y
 		*/
-		int getExponent(const Ec& g, bool *b = 0) const
+		int getExponent(const Ec& y, bool *b = 0) const
 		{
-			typename Cache::const_iterator i = cache.find(g);
+			typename Cache::const_iterator i = cache.find(y);
 			if (i == cache.end()) {
 				if (b) {
 					*b = false;
 					return 0;
 				}
-				throw cybozu::Exception("Elgamal:PowerCache:getExponent:not found") << g;
+				throw cybozu::Exception("Elgamal:PowerCache:getExponent:not found") << y;
 			}
 			if (b) *b = true;
 			return i->second;
@@ -469,20 +442,17 @@ struct ElgamalT {
 	public:
 		/*
 			init
-			input : f
-			output : (g, h, z)
-			Ec = <f>
-			g in Ec
+			input : g
+			output : (h, z)
+			Ec = <g>
 			h = g^z
 		*/
-		void init(const Ec& f, size_t bitSize, fp::RandGen rg = fp::RandGen())
+		void init(const Ec& g, size_t bitSize, fp::RandGen rg = fp::RandGen())
 		{
-			Ec g, h;
-			z.setRand(rg);
-			Ec::mul(g, f, z);
+			Ec h;
 			z.setRand(rg);
 			Ec::mul(h, g, z);
-			pub.init(bitSize, f, g, h);
+			pub.init(bitSize, g, h);
 		}
 		const PublicKey& getPublicKey() const { return pub; }
 		/*
@@ -490,12 +460,12 @@ struct ElgamalT {
 			input : c = (c1, c2)
 			output : m
 			M = c2 / c1^z
-			find m such that M = f^m and |m| < limit
+			find m such that M = g^m and |m| < limit
 			@memo 7sec@core i3 for m = 1e6
 		*/
 		void dec(Zn& m, const CipherText& c, int limit = 100000) const
 		{
-			const Ec& f = pub.getF();
+			const Ec& g = pub.getG();
 			Ec c1z;
 			Ec::mul(c1z, c.c1, z);
 			if (c1z == c.c2) {
@@ -505,12 +475,12 @@ struct ElgamalT {
 			Ec t1(c1z);
 			Ec t2(c.c2);
 			for (int i = 1; i < limit; i++) {
-				Ec::add(t1, t1, f);
+				Ec::add(t1, t1, g);
 				if (t1 == c.c2) {
 					m = i;
 					return;
 				}
-				Ec::add(t2, t2, f);
+				Ec::add(t2, t2, g);
 				if (t2 == c1z) {
 					m = -i;
 					return;
@@ -519,20 +489,20 @@ struct ElgamalT {
 			throw cybozu::Exception("elgamal:PrivateKey:dec:overflow");
 		}
 		/*
-			powfm = c2 / c1^z = f^m
+			powgm = c2 / c1^z = g^m
 		*/
-		void getPowerf(Ec& powfm, const CipherText& c) const
+		void getPowerg(Ec& powgm, const CipherText& c) const
 		{
 			Ec c1z;
 			Ec::mul(c1z, c.c1, z);
-			Ec::sub(powfm, c.c2, c1z);
+			Ec::sub(powgm, c.c2, c1z);
 		}
 		/*
 			set range of message to decode quickly
 		*/
 		void setCache(int rangeMin, int rangeMax)
 		{
-			cache.init(pub.getF(), rangeMin, rangeMax);
+			cache.init(pub.getG(), rangeMin, rangeMax);
 		}
 		/*
 			clear cache
@@ -550,9 +520,9 @@ struct ElgamalT {
 		*/
 		int dec(const CipherText& c, bool *b = 0) const
 		{
-			Ec powfm;
-			getPowerf(powfm, c);
-			return cache.getExponent(powfm, b);
+			Ec powgm;
+			getPowerg(powgm, c);
+			return cache.getExponent(powgm, b);
 		}
 		/*
 			check whether c is encrypted zero message
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index ffb4699139..8c16468150 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -73,6 +73,18 @@ bool isEnableJIT(); // 1st call is not threadsafe
 uint32_t sha256(void *out, uint32_t maxOutSize, const void *msg, uint32_t msgSize);
 uint32_t sha512(void *out, uint32_t maxOutSize, const void *msg, uint32_t msgSize);
 
+namespace local {
+
+inline void byteSwap(void *x, size_t n)
+{
+	char *p = (char *)x;
+	for (size_t i = 0; i < n / 2; i++) {
+		fp::swap_(p[i], p[n - 1 - i]);
+	}
+}
+
+} // mcl::fp::local
+
 } // mcl::fp
 
 template<class tag = FpTag, size_t maxBitSize = MCL_MAX_BIT_SIZE>
@@ -89,6 +101,7 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 	static fp::Op op_;
 	static FpT<tag, maxBitSize> inv2_;
 	static int ioMode_;
+	static bool isETHserialization_;
 	template<class Fp> friend class FpDblT;
 	template<class Fp> friend class Fp2T;
 	template<class Fp> friend struct Fp6T;
@@ -131,6 +144,8 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 			if (!*pb) return;
 		}
 		inv(inv2_, 2);
+		ioMode_ = 0;
+		isETHserialization_ = false;
 #ifdef MCL_XBYAK_DIRECT_CALL
 		add = fp::func_ptr_cast<void (*)(FpT& z, const FpT& x, const FpT& y)>(op_.fp_addA_);
 		if (add == 0) add = addC;
@@ -253,6 +268,9 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 			} else {
 				readSize = cybozu::readSome(v_, n, is);
 			}
+			if (isETHserialization_ && ioMode & (IoSerialize | IoSerializeHexStr)) {
+				fp::local::byteSwap(v_, n);
+			}
 			if (readSize != n) return;
 		} else {
 			char buf[1024];
@@ -283,10 +301,18 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 			} else {
 				fp::Block b;
 				getBlock(b);
+				const char *src = (const char *)b.p;
+				char rev[fp::maxUnitSize * sizeof(fp::Unit)];
+				if (isETHserialization_ && ioMode & (IoSerialize | IoSerializeHexStr)) {
+					for (size_t i = 0; i < n; i++) {
+						rev[i] = src[n - 1 - i];
+					}
+					src = rev;
+				}
 				if (ioMode & IoSerializeHexStr) {
-					mcl::fp::writeHexStr(pb, os, b.p, n);
+					mcl::fp::writeHexStr(pb, os, src, n);
 				} else {
-					cybozu::write(pb, os, b.p, n);
+					cybozu::write(pb, os, src, n);
 				}
 			}
 			return;
@@ -302,10 +328,13 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 		}
 		cybozu::write(pb, os, buf + sizeof(buf) - len, len);
 	}
+	/*
+		mode = Mod : set x mod p if sizeof(S) * n <= 64 else error
+	*/
 	template<class S>
-	void setArray(bool *pb, const S *x, size_t n)
+	void setArray(bool *pb, const S *x, size_t n, mcl::fp::MaskMode mode = fp::NoMask)
 	{
-		*pb = fp::copyAndMask(v_, x, sizeof(S) * n, op_, fp::NoMask);
+		*pb = fp::copyAndMask(v_, x, sizeof(S) * n, op_, mode);
 		toMont();
 	}
 	/*
@@ -495,6 +524,12 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 	{
 		ioMode_ = ioMode;
 	}
+	static void setETHserialization(bool ETHserialization)
+	{
+		if (getBitSize() != 381) return;
+		isETHserialization_ = ETHserialization;
+	}
+	static inline bool isETHserialization() { return isETHserialization_; }
 	static inline int getIoMode() { return ioMode_; }
 	static inline size_t getModBitLen() { return getBitSize(); }
 	static inline void setHashFunc(uint32_t hash(void *out, uint32_t maxOutSize, const void *msg, uint32_t msgSize))
@@ -619,6 +654,7 @@ class FpT : public fp::Serializable<FpT<tag, maxBitSize>,
 template<class tag, size_t maxBitSize> fp::Op FpT<tag, maxBitSize>::op_;
 template<class tag, size_t maxBitSize> FpT<tag, maxBitSize> FpT<tag, maxBitSize>::inv2_;
 template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAuto;
+template<class tag, size_t maxBitSize> bool FpT<tag, maxBitSize>::isETHserialization_ = false;
 #ifdef MCL_XBYAK_DIRECT_CALL
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::add)(FpT& z, const FpT& x, const FpT& y);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y);
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 96a6edb1a2..8d79a7ee29 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -283,9 +283,13 @@ class Fp2T : public fp::Serializable<Fp2T<_Fp>,
 	template<class InputStream>
 	void load(bool *pb, InputStream& is, int ioMode)
 	{
-		a.load(pb, is, ioMode);
+		Fp *ap = &a, *bp = &b;
+		if (Fp::isETHserialization_ && ioMode & (IoSerialize | IoSerializeHexStr)) {
+			fp::swap_(ap, bp);
+		}
+		ap->load(pb, is, ioMode);
 		if (!*pb) return;
-		b.load(pb, is, ioMode);
+		bp->load(pb, is, ioMode);
 	}
 	/*
 		Fp2T = <a> + ' ' + <b>
@@ -293,14 +297,18 @@ class Fp2T : public fp::Serializable<Fp2T<_Fp>,
 	template<class OutputStream>
 	void save(bool *pb, OutputStream& os, int ioMode) const
 	{
+		const Fp *ap = &a, *bp = &b;
+		if (Fp::isETHserialization_ && ioMode & (IoSerialize | IoSerializeHexStr)) {
+			fp::swap_(ap, bp);
+		}
 		const char sep = *fp::getIoSeparator(ioMode);
-		a.save(pb, os, ioMode);
+		ap->save(pb, os, ioMode);
 		if (!*pb) return;
 		if (sep) {
 			cybozu::writeChar(pb, os, sep);
 			if (!*pb) return;
 		}
-		b.save(pb, os, ioMode);
+		bp->save(pb, os, ioMode);
 	}
 	bool isZero() const { return a.isZero() && b.isZero(); }
 	bool isOne() const { return a.isOne() && b.isZero(); }
@@ -386,6 +394,7 @@ class Fp2T : public fp::Serializable<Fp2T<_Fp>,
 //		assert(Fp::maxSize <= 256);
 		mcl::fp::Op& op = Fp::op_;
 		assert(op.xi_a);
+		mul_xi = 0;
 #ifdef MCL_XBYAK_DIRECT_CALL
 		add = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_addA_);
 		if (add == 0) add = addC;
diff --git a/include/mcl/gmp_util.hpp b/include/mcl/gmp_util.hpp
index a64bed4cae..117ecff779 100644
--- a/include/mcl/gmp_util.hpp
+++ b/include/mcl/gmp_util.hpp
@@ -10,7 +10,9 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <stdint.h>
+#ifndef CYBOZU_DONT_USE_EXCEPTION
 #include <cybozu/exception.hpp>
+#endif
 #include <mcl/randgen.hpp>
 #ifdef _MSC_VER
 	#pragma warning(push)
@@ -24,6 +26,9 @@
 #if defined(__EMSCRIPTEN__) || defined(__wasm__)
 	#define MCL_USE_VINT
 #endif
+#ifndef MCL_MAX_BIT_SIZE
+	#define MCL_MAX_BIT_SIZE 521
+#endif
 #ifdef MCL_USE_VINT
 #include <mcl/vint.hpp>
 typedef mcl::Vint mpz_class;
@@ -859,4 +864,96 @@ class SquareRoot {
 #endif
 };
 
+/*
+	Barrett Reduction
+	for non GMP version
+	mod of GMP is faster than Modp
+*/
+struct Modp {
+	static const size_t unitBitSize = sizeof(mcl::fp::Unit) * 8;
+	mpz_class p_;
+	mpz_class u_;
+	mpz_class a_;
+	size_t pBitSize_;
+	size_t N_;
+	bool initU_; // Is u_ initialized?
+	Modp()
+		: pBitSize_(0)
+		, N_(0)
+		, initU_(false)
+	{
+	}
+	// x &= (1 << (unitBitSize * unitSize)) - 1, i.e. keep only the low unitSize units
+	void shrinkSize(mpz_class &x, size_t unitSize) const
+	{
+		size_t u = gmp::getUnitSize(x);
+		if (u < unitSize) return;
+		bool b;
+		gmp::setArray(&b, x, gmp::getUnit(x), unitSize);
+		(void)b;
+		assert(b);
+	}
+	// p_ is set by p and compute (u_, a_) if possible
+	void init(const mpz_class& p)
+	{
+		p_ = p;
+		pBitSize_ = gmp::getBitSize(p);
+		N_ = (pBitSize_ + unitBitSize - 1) / unitBitSize;
+		initU_ = false;
+#if 0
+		u_ = (mpz_class(1) << (unitBitSize * 2 * N_)) / p_;
+#else
+		/*
+			1 << (unitBitSize * 2 * N_) may overflow,
+			so use (1 << (unitBitSize * 2 * N_)) - 1 because u_ is the same.
+		*/
+		uint8_t buf[48 * 2];
+		const size_t byteSize = unitBitSize / 8 * 2 * N_;
+		if (byteSize > sizeof(buf)) return;
+		memset(buf, 0xff, byteSize);
+		bool b;
+		gmp::setArray(&b, u_, buf, byteSize);
+		if (!b) return;
+#endif
+		u_ /= p_;
+		a_ = mpz_class(1) << (unitBitSize * (N_ + 1));
+		initU_ = true;
+	}
+	void modp(mpz_class& r, const mpz_class& t) const
+	{
+		assert(p_ > 0);
+		const size_t tBitSize = gmp::getBitSize(t);
+		// use gmp::mod if init() fails or t is too large
+		if (tBitSize > unitBitSize * 2 * N_ || !initU_) {
+			gmp::mod(r, t, p_);
+			return;
+		}
+		if (tBitSize < pBitSize_) {
+			r = t;
+			return;
+		}
+		// mod is faster than modp if t is small
+		if (tBitSize <= unitBitSize * N_) {
+			gmp::mod(r, t, p_);
+			return;
+		}
+		mpz_class q;
+		q = t;
+		q >>= unitBitSize * (N_ - 1);
+		q *= u_;
+		q >>= unitBitSize * (N_ + 1);
+		q *= p_;
+		shrinkSize(q, N_ + 1);
+		r = t;
+		shrinkSize(r, N_ + 1);
+		r -= q;
+		if (r < 0) {
+			r += a_;
+		}
+		if (r >= p_) {
+			r -= p_;
+		}
+	}
+};
+
 } // mcl
diff --git a/include/mcl/impl/bn_c_impl.hpp b/include/mcl/impl/bn_c_impl.hpp
index 7d92fe63fd..7c14f63766 100644
--- a/include/mcl/impl/bn_c_impl.hpp
+++ b/include/mcl/impl/bn_c_impl.hpp
@@ -17,6 +17,7 @@
 	#error "not supported size"
 #endif
 #include <mcl/lagrange.hpp>
+#include <mcl/ecparam.hpp>
 using namespace mcl::bn;
 
 static Fr *cast(mclBnFr *p) { return reinterpret_cast<Fr*>(p); }
@@ -34,6 +35,12 @@ static const Fp12 *cast(const mclBnGT *p) { return reinterpret_cast<const Fp12*>
 static Fp6 *cast(uint64_t *p) { return reinterpret_cast<Fp6*>(p); }
 static const Fp6 *cast(const uint64_t *p) { return reinterpret_cast<const Fp6*>(p); }
 
+static Fp2 *cast(mclBnFp2 *p) { return reinterpret_cast<Fp2*>(p); }
+static const Fp2 *cast(const mclBnFp2 *p) { return reinterpret_cast<const Fp2*>(p); }
+
+static Fp *cast(mclBnFp *p) { return reinterpret_cast<Fp*>(p); }
+static const Fp *cast(const mclBnFp *p) { return reinterpret_cast<const Fp*>(p); }
+
 template<class T>
 int setStr(T *x, const char *buf, mclSize bufSize, int ioMode)
 {
@@ -53,11 +60,23 @@ extern "C" MCLBN_DLL_API void mclBnFree(void *p)
 }
 #endif
 
+int mclBn_getVersion()
+{
+	return mcl::version;
+}
+
 int mclBn_init(int curve, int compiledTimeVar)
 {
 	if (compiledTimeVar != MCLBN_COMPILED_TIME_VAR) {
 		return -(compiledTimeVar | (MCLBN_COMPILED_TIME_VAR * 100));
 	}
+	if (MCL_EC_BEGIN <= curve && curve < MCL_EC_END) {
+		const mcl::EcParam *para = mcl::getEcParam(curve);
+		if (para == 0) return -2;
+		bool b;
+		initG1only(&b, *para);
+		return b ? 0 : -1;
+	}
 	const mcl::CurveParam& cp = mcl::getCurveParam(curve);
 	bool b;
 	initPairing(&b, cp);
@@ -71,7 +90,7 @@ int mclBn_getOpUnitSize()
 
 int mclBn_getG1ByteSize()
 {
-	return (int)Fp::getByteSize();
+	return mclBn_getFpByteSize();
 }
 
 int mclBn_getFrByteSize()
@@ -79,6 +98,11 @@ int mclBn_getFrByteSize()
 	return (int)Fr::getByteSize();
 }
 
+int mclBn_getFpByteSize()
+{
+	return (int)Fp::getByteSize();
+}
+
 mclSize mclBn_getCurveOrder(char *buf, mclSize maxBufSize)
 {
 	return Fr::getModulo(buf, maxBufSize);
@@ -89,6 +113,11 @@ mclSize mclBn_getFieldOrder(char *buf, mclSize maxBufSize)
 	return Fp::getModulo(buf, maxBufSize);
 }
 
+void mclBn_setETHserialization(int ETHserialization)
+{
+	Fp::setETHserialization(ETHserialization == 1);
+}
+
 ////////////////////////////////////////////////
 // set zero
 void mclBnFr_clear(mclBnFr *x)
@@ -115,6 +144,12 @@ int mclBnFr_setLittleEndian(mclBnFr *x, const void *buf, mclSize bufSize)
 	cast(x)->setArrayMask((const char *)buf, bufSize);
 	return 0;
 }
+int mclBnFr_setLittleEndianMod(mclBnFr *x, const void *buf, mclSize bufSize)
+{
+	bool b;
+	cast(x)->setArray(&b, (const char *)buf, bufSize, mcl::fp::Mod);
+	return b ? 0 : -1;
+}
 mclSize mclBnFr_deserialize(mclBnFr *x, const void *buf, mclSize bufSize)
 {
 	return (mclSize)cast(x)->deserialize(buf, bufSize);
@@ -525,3 +560,89 @@ void mclBn_verifyOrderG2(int doVerify)
 	verifyOrderG2(doVerify != 0);
 }
 
+mclSize mclBnFp_getStr(char *buf, mclSize maxBufSize, const mclBnFp *x, int ioMode)
+{
+	return cast(x)->getStr(buf, maxBufSize, ioMode);
+}
+int mclBnFp_setStr(mclBnFp *x, const char *buf, mclSize bufSize, int ioMode)
+{
+	return setStr(x, buf, bufSize, ioMode);
+}
+mclSize mclBnFp_deserialize(mclBnFp *x, const void *buf, mclSize bufSize)
+{
+	return (mclSize)cast(x)->deserialize(buf, bufSize);
+}
+
+mclSize mclBnFp_serialize(void *buf, mclSize maxBufSize, const mclBnFp *x)
+{
+	return (mclSize)cast(x)->serialize(buf, maxBufSize);
+}
+
+void mclBnFp_clear(mclBnFp *x)
+{
+	cast(x)->clear();
+}
+
+int mclBnFp_setLittleEndian(mclBnFp *x, const void *buf, mclSize bufSize)
+{
+	cast(x)->setArrayMask((const char *)buf, bufSize);
+	return 0;
+}
+
+int mclBnFp_setLittleEndianMod(mclBnFp *x, const void *buf, mclSize bufSize)
+{
+	bool b;
+	cast(x)->setArray(&b, (const char *)buf, bufSize, mcl::fp::Mod);
+	return b ? 0 : -1;
+}
+int mclBnFp_isEqual(const mclBnFp *x, const mclBnFp *y)
+{
+	return *cast(x) == *cast(y);
+}
+
+int mclBnFp_setHashOf(mclBnFp *x, const void *buf, mclSize bufSize)
+{
+	cast(x)->setHashOf(buf, bufSize);
+	return 0;
+}
+
+int mclBnFp_mapToG1(mclBnG1 *y, const mclBnFp *x)
+{
+	bool b;
+	mapToG1(&b, *cast(y), *cast(x));
+	return b ? 0 : -1;
+}
+
+mclSize mclBnFp2_deserialize(mclBnFp2 *x, const void *buf, mclSize bufSize)
+{
+	return (mclSize)cast(x)->deserialize(buf, bufSize);
+}
+
+mclSize mclBnFp2_serialize(void *buf, mclSize maxBufSize, const mclBnFp2 *x)
+{
+	return (mclSize)cast(x)->serialize(buf, maxBufSize);
+}
+
+void mclBnFp2_clear(mclBnFp2 *x)
+{
+	cast(x)->clear();
+}
+
+int mclBnFp2_isEqual(const mclBnFp2 *x, const mclBnFp2 *y)
+{
+	return *cast(x) == *cast(y);
+}
+
+int mclBnFp2_mapToG2(mclBnG2 *y, const mclBnFp2 *x)
+{
+	bool b;
+	mapToG2(&b, *cast(y), *cast(x));
+	return b ? 0 : -1;
+}
+
+int mclBnG1_getBasePoint(mclBnG1 *x)
+{
+	*cast(x) = mcl::bn::getG1basePoint();
+	return 0;
+}
+
diff --git a/include/mcl/lagrange.hpp b/include/mcl/lagrange.hpp
index 7c0218896a..18e0597ec1 100644
--- a/include/mcl/lagrange.hpp
+++ b/include/mcl/lagrange.hpp
@@ -15,14 +15,19 @@ namespace mcl {
 template<class G, class F>
 void LagrangeInterpolation(bool *pb, G& out, const F *S, const G *vec, size_t k)
 {
+	if (k == 0) {
+		*pb = false;
+		return;
+	}
+	if (k == 1) {
+		out = vec[0];
+		*pb = true;
+		return;
+	}
 	/*
 		delta_{i,S}(0) = prod_{j != i} S[j] / (S[j] - S[i]) = a / b
 		where a = prod S[j], b = S[i] * prod_{j != i} (S[j] - S[i])
 	*/
-	if (k < 2) {
-		*pb = false;
-		return;
-	}
 	F a = S[0];
 	for (size_t i = 1; i < k; i++) {
 		a *= S[i];
@@ -58,15 +63,20 @@ void LagrangeInterpolation(bool *pb, G& out, const F *S, const G *vec, size_t k)
 
 /*
 	out = f(x) = c[0] + c[1] * x + c[2] * x^2 + ... + c[cSize - 1] * x^(cSize - 1)
-	@retval 0 if succeed else -1
+	@note *pb is set to false if cSize == 0 (the function itself returns void)
 */
 template<class G, class T>
 void evaluatePolynomial(bool *pb, G& out, const G *c, size_t cSize, const T& x)
 {
-	if (cSize < 2) {
+	if (cSize == 0) {
 		*pb = false;
 		return;
 	}
+	if (cSize == 1) {
+		out = c[0];
+		*pb = true;
+		return;
+	}
 	G y = c[cSize - 1];
 	for (int i = (int)cSize - 2; i >= 0; i--) {
 		G::mul(y, y, x);
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index ee80c0b2ea..aad9aa7c6c 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -10,9 +10,6 @@
 #include <memory.h>
 #include <mcl/array.hpp>
 
-#ifndef MCL_MAX_BIT_SIZE
-	#define MCL_MAX_BIT_SIZE 521
-#endif
 #if defined(__EMSCRIPTEN__) || defined(__wasm__)
 	#define MCL_DONT_USE_XBYAK
 	#define MCL_DONT_USE_OPENSSL
@@ -26,6 +23,8 @@
 
 namespace mcl {
 
+static const int version = 0x094; /* 0xABC = A.BC */
+
 /*
 	specifies available string format mode for X::setIoMode()
 	// for Fp, Fp2, Fp6, Fp12
@@ -59,7 +58,7 @@ namespace mcl {
 	IoArray
 		array of Unit(fixed size = Fp::getByteSize())
 	IoArrayRaw
-		array of Unit(fixed size = Fp::getByteSize()) without Montgomery convresion
+		array of Unit(fixed size = Fp::getByteSize()) without Montgomery conversion
 
 	// for Ec::setIoMode()
 	IoEcAffine(default)
@@ -161,7 +160,8 @@ enum PrimeMode {
 enum MaskMode {
 	NoMask = 0, // throw if greater or equal
 	SmallMask = 1, // 1-bit smaller mask if greater or equal
-	MaskAndMod = 2 // mask and substract if greater or equal
+	MaskAndMod = 2, // mask and subtract if greater or equal
+	Mod = 3 // mod p
 };
 
 struct Op {
@@ -174,6 +174,7 @@ struct Op {
 	mpz_class mp;
 	uint32_t pmod4;
 	mcl::SquareRoot sq;
+	mcl::Modp modp;
 	Unit half[maxUnitSize]; // (p + 1) / 2
 	Unit oneRep[maxUnitSize]; // 1(=inv R if Montgomery)
 	/*
@@ -328,12 +329,12 @@ struct Op {
 		fp2_mulNF = 0;
 		fp2_inv = 0;
 		fp2_mul_xiA_ = 0;
+		hash = 0;
 
 		primeMode = PM_GENERIC;
 		isFullBit = false;
 		isMont = false;
 		isFastMod = false;
-		hash = 0;
 	}
 	void fromMont(Unit* y, const Unit *x) const
 	{
diff --git a/include/mcl/she.h b/include/mcl/she.h
index 60b399c65c..d474216bf4 100644
--- a/include/mcl/she.h
+++ b/include/mcl/she.h
@@ -163,6 +163,19 @@ MCLSHE_DLL_API int shePrecomputedPublicKeyEncG1(sheCipherTextG1 *c, const shePre
 MCLSHE_DLL_API int shePrecomputedPublicKeyEncG2(sheCipherTextG2 *c, const shePrecomputedPublicKey *ppub, mclInt m);
 MCLSHE_DLL_API int shePrecomputedPublicKeyEncGT(sheCipherTextGT *c, const shePrecomputedPublicKey *ppub, mclInt m);
 
+/*
+	enc large integer
+	buf[bufSize] is little endian
+	bufSize <= (FrBitSize + 63) & ~63
+	return 0 if success
+*/
+MCLSHE_DLL_API int sheEncIntVecG1(sheCipherTextG1 *c, const shePublicKey *pub, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int sheEncIntVecG2(sheCipherTextG2 *c, const shePublicKey *pub, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int sheEncIntVecGT(sheCipherTextGT *c, const shePublicKey *pub, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int shePrecomputedPublicKeyEncIntVecG1(sheCipherTextG1 *c, const shePrecomputedPublicKey *ppub, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int shePrecomputedPublicKeyEncIntVecG2(sheCipherTextG2 *c, const shePrecomputedPublicKey *ppub, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int shePrecomputedPublicKeyEncIntVecGT(sheCipherTextGT *c, const shePrecomputedPublicKey *ppub, const void *buf, mclSize bufSize);
+
 /*
 	m must be 0 or 1
 */
@@ -235,6 +248,15 @@ MCLSHE_DLL_API int sheSubGT(sheCipherTextGT *z, const sheCipherTextGT *x, const
 MCLSHE_DLL_API int sheMulG1(sheCipherTextG1 *z, const sheCipherTextG1 *x, mclInt y);
 MCLSHE_DLL_API int sheMulG2(sheCipherTextG2 *z, const sheCipherTextG2 *x, mclInt y);
 MCLSHE_DLL_API int sheMulGT(sheCipherTextGT *z, const sheCipherTextGT *x, mclInt y);
+/*
+	mul large integer
+	buf[bufSize] is little endian
+	bufSize <= (FrBitSize + 63) & ~63
+	return 0 if success
+*/
+MCLSHE_DLL_API int sheMulIntVecG1(sheCipherTextG1 *z, const sheCipherTextG1 *x, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int sheMulIntVecG2(sheCipherTextG2 *z, const sheCipherTextG2 *x, const void *buf, mclSize bufSize);
+MCLSHE_DLL_API int sheMulIntVecGT(sheCipherTextGT *z, const sheCipherTextGT *x, const void *buf, mclSize bufSize);
 
 // return 0 if success
 // z = x * y
diff --git a/include/mcl/she.hpp b/include/mcl/she.hpp
index 3ce361454c..282f2fe837 100644
--- a/include/mcl/she.hpp
+++ b/include/mcl/she.hpp
@@ -170,9 +170,9 @@ class HashTable {
 		find range which has same hash of xP in kcv_,
 		and detect it
 	*/
-	int basicLog(G xP, bool *ok = 0) const
+	int basicLog(G xP, bool *pok = 0) const
 	{
-		if (ok) *ok = true;
+		if (pok) *pok = true;
 		if (I::isZero(xP)) return 0;
 		typedef KeyCountVec::const_iterator Iter;
 		KeyCount kc;
@@ -205,8 +205,8 @@ class HashTable {
 			prev = abs_c;
 			++p.first;
 		}
-		if (ok) {
-			*ok = false;
+		if (pok) {
+			*pok = false;
 			return 0;
 		}
 		throw cybozu::Exception("HashTable:basicLog:not found");
@@ -215,11 +215,12 @@ class HashTable {
 		compute log_P(xP)
 		call basicLog at most 2 * tryNum
 	*/
-	int64_t log(const G& xP) const
+	int64_t log(const G& xP, bool *pok = 0) const
 	{
 		bool ok;
 		int c = basicLog(xP, &ok);
 		if (ok) {
+			if (pok) *pok = true;
 			return c;
 		}
 		G posP = xP, negP = xP;
@@ -231,15 +232,21 @@ class HashTable {
 			posCenter += next;
 			c = basicLog(posP, &ok);
 			if (ok) {
+				if (pok) *pok = true;
 				return posCenter + c;
 			}
 			I::add(negP, negP, nextP_);
 			negCenter -= next;
 			c = basicLog(negP, &ok);
 			if (ok) {
+				if (pok) *pok = true;
 				return negCenter + c;
 			}
 		}
+		if (pok) {
+			*pok = false;
+			return 0;
+		}
 		throw cybozu::Exception("HashTable:log:not found");
 	}
 	/*
@@ -683,7 +690,7 @@ struct SHET {
 			throw cybozu::Exception("she:dec:log:not found");
 		}
 #endif
-		int64_t dec(const CipherTextG1& c) const
+		int64_t dec(const CipherTextG1& c, bool *pok = 0) const
 		{
 			if (useDecG1ViaGT_) return decViaGT(c);
 			/*
@@ -694,51 +701,51 @@ struct SHET {
 			G1 R;
 			G1::mul(R, c.T_, x_);
 			G1::sub(R, c.S_, R);
-			return PhashTbl_.log(R);
+			return PhashTbl_.log(R, pok);
 		}
-		int64_t dec(const CipherTextG2& c) const
+		int64_t dec(const CipherTextG2& c, bool *pok = 0) const
 		{
 			if (useDecG2ViaGT_) return decViaGT(c);
 			G2 R;
 			G2::mul(R, c.T_, y_);
 			G2::sub(R, c.S_, R);
-			return QhashTbl_.log(R);
+			return QhashTbl_.log(R, pok);
 		}
-		int64_t dec(const CipherTextA& c) const
+		int64_t dec(const CipherTextA& c, bool *pok = 0) const
 		{
-			return dec(c.c1_);
+			return dec(c.c1_, pok);
 		}
-		int64_t dec(const CipherTextGT& c) const
+		int64_t dec(const CipherTextGT& c, bool *pok = 0) const
 		{
 			GT v;
 			getPowOfePQ(v, c);
-			return ePQhashTbl_.log(v);
+			return ePQhashTbl_.log(v, pok);
 //			return log(g, v);
 		}
-		int64_t decViaGT(const CipherTextG1& c) const
+		int64_t decViaGT(const CipherTextG1& c, bool *pok = 0) const
 		{
 			G1 R;
 			G1::mul(R, c.T_, x_);
 			G1::sub(R, c.S_, R);
 			GT v;
 			pairing(v, R, Q_);
-			return ePQhashTbl_.log(v);
+			return ePQhashTbl_.log(v, pok);
 		}
-		int64_t decViaGT(const CipherTextG2& c) const
+		int64_t decViaGT(const CipherTextG2& c, bool *pok = 0) const
 		{
 			G2 R;
 			G2::mul(R, c.T_, y_);
 			G2::sub(R, c.S_, R);
 			GT v;
 			pairing(v, P_, R);
-			return ePQhashTbl_.log(v);
+			return ePQhashTbl_.log(v, pok);
 		}
-		int64_t dec(const CipherText& c) const
+		int64_t dec(const CipherText& c, bool *pok = 0) const
 		{
 			if (c.isMultiplied()) {
-				return dec(c.m_);
+				return dec(c.m_, pok);
 			} else {
-				return dec(c.a_);
+				return dec(c.a_, pok);
 			}
 		}
 		bool isZero(const CipherTextG1& c) const
@@ -1045,7 +1052,7 @@ struct SHET {
 		G2 R5, R6;
 		ElGamalEnc(R4, R3, rm, Pmul, xPmul, &rp);
 		ElGamalEnc(R6, R5, rm, Qmul, yQmul, &rs);
-		char buf[sizeof(Fr) * 12];
+		char buf[sizeof(Fp) * 12];
 		cybozu::MemoryOutputStream os(buf, sizeof(buf));
 		S1.save(os);
 		T1.save(os);
@@ -1105,7 +1112,7 @@ struct SHET {
 		R5 -= X2;
 		G2::mul(X2, S2, c);
 		R6 -= X2;
-		char buf[sizeof(Fr) * 12];
+		char buf[sizeof(Fp) * 12];
 		cybozu::MemoryOutputStream os(buf, sizeof(buf));
 		S1.save(os);
 		T1.save(os);
@@ -1325,7 +1332,7 @@ struct SHET {
 
 			G1 P1, P2;
 			G1::mul(P1, xP_, ra);
-			if (m) {
+			if (m != 0) {
 //				G1::mul(P2, P, m);
 				PhashTbl_.mulByWindowMethod(P2, m);
 				P1 += P2;
@@ -1656,7 +1663,8 @@ struct SHET {
 		{
 			mul(z, x.c1_, y.c2_);
 		}
-		static void mul(CipherTextGT& z, const CipherTextGT& x, int64_t y)
+		template<class INT>
+		static void mul(CipherTextGT& z, const CipherTextGT& x, const INT& y)
 		{
 			for (int i = 0; i < 4; i++) {
 				GT::pow(z.g_[i], x.g_[i], y);
diff --git a/include/mcl/util.hpp b/include/mcl/util.hpp
index edef971cb3..a406241768 100644
--- a/include/mcl/util.hpp
+++ b/include/mcl/util.hpp
@@ -16,6 +16,7 @@
 
 namespace mcl { namespace fp {
 
+// some environments do not have utility
 template<class T>
 T abs_(T x) { return x < 0 ? -x : x; }
 
@@ -34,7 +35,6 @@ void swap_(T& x, T& y)
 	y = t;
 }
 
-
 /*
 	get pp such that p * pp = -1 mod M,
 	where p is prime and M = 1 << 64(or 32).
diff --git a/include/mcl/vint.hpp b/include/mcl/vint.hpp
index c2a7a10bff..bf324f4d26 100644
--- a/include/mcl/vint.hpp
+++ b/include/mcl/vint.hpp
@@ -2,7 +2,9 @@
 /**
 	emulate mpz_class
 */
+#ifndef CYBOZU_DONT_USE_EXCEPTION
 #include <cybozu/exception.hpp>
+#endif
 #include <cybozu/bit_operation.hpp>
 #include <cybozu/xorshift.hpp>
 #include <assert.h>
@@ -19,7 +21,7 @@
 	#define MCL_VINT_FIXED_BUFFER
 #endif
 #ifndef MCL_MAX_BIT_SIZE
-	#define MCL_MAX_BIT_SIZE 384
+	#error "define MCL_MAX_BIT_SIZE"
 #endif
 
 #ifndef MCL_SIZEOF_UNIT
@@ -40,6 +42,13 @@ typedef uint64_t Unit;
 typedef uint32_t Unit;
 #endif
 
+template<size_t x>
+struct RoundUp {
+	static const size_t UnitBitSize = sizeof(Unit) * 8;
+	static const size_t N = (x + UnitBitSize - 1) / UnitBitSize;
+	static const size_t bit = N * UnitBitSize;
+};
+
 template<class T>
 void dump(const T *x, size_t n, const char *msg = "")
 {
@@ -568,6 +577,7 @@ void divNM(T *q, size_t qn, T *r, const T *x, size_t xn, const T *y, size_t yn)
 	yn = getRealSize(y, yn);
 	if (x == y) {
 		assert(xn == yn);
+	x_is_y:
 		clearN(r, rn);
 		if (q) {
 			q[0] = 1;
@@ -579,6 +589,7 @@ void divNM(T *q, size_t qn, T *r, const T *x, size_t xn, const T *y, size_t yn)
 		/*
 			if y > x then q = 0 and r = x
 		*/
+	q_is_zero:
 		copyN(r, x, xn);
 		clearN(r + xn, rn - xn);
 		if (q) clearN(q, qn);
@@ -598,11 +609,61 @@ void divNM(T *q, size_t qn, T *r, const T *x, size_t xn, const T *y, size_t yn)
 		clearN(r + 1, rn - 1);
 		return;
 	}
+	const size_t yTopBit = cybozu::bsr(y[yn - 1]);
 	assert(yn >= 2);
+	if (xn == yn) {
+		const size_t xTopBit = cybozu::bsr(x[xn - 1]);
+		if (xTopBit < yTopBit) goto q_is_zero;
+		if (yTopBit == xTopBit) {
+			int ret = compareNM(x, xn, y, yn);
+			if (ret == 0) goto x_is_y;
+			if (ret < 0) goto q_is_zero;
+			if (r) {
+				subN(r, x, y, yn);
+			}
+			if (q) {
+				q[0] = 1;
+				clearN(q + 1, qn - 1);
+			}
+			return;
+		}
+		assert(xTopBit > yTopBit);
+		// fast reduction for larger than fullbit-3 size p
+		if (yTopBit >= sizeof(T) * 8 - 4) {
+			T *xx = (T*)CYBOZU_ALLOCA(sizeof(T) * xn);
+			T qv = 0;
+			if (yTopBit == sizeof(T) * 8 - 2) {
+				copyN(xx, x, xn);
+			} else {
+				qv = x[xn - 1] >> (yTopBit + 1);
+				mulu1(xx, y, yn, qv);
+				subN(xx, x, xx, xn);
+				xn = getRealSize(xx, xn);
+			}
+			for (;;) {
+				T ret = subN(xx, xx, y, yn);
+				if (ret) {
+					addN(xx, xx, y, yn);
+					break;
+				}
+				qv++;
+				xn = getRealSize(xx, xn);
+			}
+			if (r) {
+				copyN(r, xx, xn);
+				clearN(r + xn, rn - xn);
+			}
+			if (q) {
+				q[0] = qv;
+				clearN(q + 1, qn - 1);
+			}
+			return;
+		}
+	}
 	/*
 		bitwise left shift x and y to adjust MSB of y[yn - 1] = 1
 	*/
-	const size_t shift = sizeof(T) * 8 - 1 - cybozu::bsr(y[yn - 1]);
+	const size_t shift = sizeof(T) * 8 - 1 - yTopBit;
 	T *xx = (T*)CYBOZU_ALLOCA(sizeof(T) * (xn + 1));
 	const T *yy;
 	if (shift) {
@@ -753,9 +814,16 @@ class FixedBuffer {
 	FixedBuffer& operator=(const FixedBuffer& rhs)
 	{
 		size_ = rhs.size_;
+#if defined(__GNUC__) && !defined(__EMSCRIPTEN__) && !defined(__clang__)
+	#pragma GCC diagnostic push
+	#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
 		for (size_t i = 0; i < size_; i++) {
 			v_[i] = rhs.v_[i];
 		}
+#if defined(__GNUC__) && !defined(__EMSCRIPTEN__) && !defined(__clang__)
+	#pragma GCC diagnostic pop
+#endif
 		return *this;
 	}
 	void clear() { size_ = 0; }
@@ -892,7 +960,11 @@ class VintT {
 		size_t zn = fp::max_(xn, yn) + 1;
 		bool b;
 		z.buf_.alloc(&b, zn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		z.buf_[zn - 1] = vint::addNM(&z.buf_[0], &x[0], xn, &y[0], yn);
 		z.trim(zn);
 	}
@@ -901,7 +973,11 @@ class VintT {
 		size_t zn = xn + 1;
 		bool b;
 		z.buf_.alloc(&b, zn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		z.buf_[zn - 1] = vint::addu1(&z.buf_[0], &x[0], xn, y);
 		z.trim(zn);
 	}
@@ -910,7 +986,11 @@ class VintT {
 		size_t zn = xn;
 		bool b;
 		z.buf_.alloc(&b, zn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		Unit c = vint::subu1(&z.buf_[0], &x[0], xn, y);
 		(void)c;
 		assert(!c);
@@ -921,7 +1001,11 @@ class VintT {
 		assert(xn >= yn);
 		bool b;
 		z.buf_.alloc(&b, xn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		Unit c = vint::subN(&z.buf_[0], &x[0], &y[0], yn);
 		if (xn > yn) {
 			c = vint::subu1(&z.buf_[yn], &x[yn], xn - yn, c);
@@ -996,10 +1080,20 @@ class VintT {
 		bool b;
 		if (q) {
 			q->buf_.alloc(&b, qn);
-			assert(b); (void)b;
+			assert(b);
+			if (!b) {
+				q->clear();
+				r.clear();
+				return;
+			}
 		}
 		r.buf_.alloc(&b, yn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			r.clear();
+			if (q) q->clear();
+			return;
+		}
 		vint::divNM(q ? &q->buf_[0] : 0, qn, &r.buf_[0], &x[0], xn, &y[0], yn);
 		if (q) {
 			q->trim(qn);
@@ -1124,8 +1218,15 @@ class VintT {
 		size_t unitSize = (sizeof(S) * size + sizeof(Unit) - 1) / sizeof(Unit);
 		buf_.alloc(pb, unitSize);
 		if (!*pb) return;
-		buf_[unitSize - 1] = 0;
-		memcpy(&buf_[0], x, sizeof(S) * size);
+		char *dst = (char *)&buf_[0];
+		const char *src = (const char *)x;
+		size_t i = 0;
+		for (; i < sizeof(S) * size; i++) {
+			dst[i] = src[i];
+		}
+		for (; i < sizeof(Unit) * unitSize; i++) {
+			dst[i] = 0;
+		}
 		trim(unitSize);
 	}
 	/*
@@ -1215,7 +1316,11 @@ class VintT {
 		assert(q <= size());
 		bool b;
 		buf_.alloc(&b, q + 1);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			clear();
+			return;
+		}
 		Unit mask = Unit(1) << r;
 		if (v) {
 			buf_[q] |= mask;
@@ -1232,7 +1337,8 @@ class VintT {
 	*/
 	void setStr(bool *pb, const char *str, int base = 0)
 	{
-		const size_t maxN = MCL_MAX_BIT_SIZE / (sizeof(MCL_SIZEOF_UNIT) * 8);
+		// allow twice size of MCL_MAX_BIT_SIZE because of multiplication
+		const size_t maxN = (MCL_MAX_BIT_SIZE * 2 + unitBitSize - 1) / unitBitSize;
 		buf_.alloc(pb, maxN);
 		if (!*pb) return;
 		*pb = false;
@@ -1303,7 +1409,11 @@ class VintT {
 		size_t zn = xn + yn;
 		bool b;
 		z.buf_.alloc(&b, zn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		vint::mulNM(&z.buf_[0], &x.buf_[0], xn, &y.buf_[0], yn);
 		z.isNeg_ = x.isNeg_ ^ y.isNeg_;
 		z.trim(zn);
@@ -1326,7 +1436,11 @@ class VintT {
 		size_t zn = xn + 1;
 		bool b;
 		z.buf_.alloc(&b, zn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		z.buf_[zn - 1] = vint::mulu1(&z.buf_[0], &x.buf_[0], xn, y);
 		z.isNeg_ = x.isNeg_;
 		z.trim(zn);
@@ -1375,7 +1489,11 @@ class VintT {
 			q->isNeg_ = xNeg ^ yNeg;
 			bool b;
 			q->buf_.alloc(&b, xn);
-			assert(b); (void)b;
+			assert(b);
+			if (!b) {
+				q->clear();
+				return 0;
+			}
 			r = (int)vint::divu1(&q->buf_[0], &x.buf_[0], xn, absY);
 			q->trim(xn);
 		} else {
@@ -1423,7 +1541,11 @@ class VintT {
 		if (q) {
 			bool b;
 			q->buf_.alloc(&b, xn);
-			assert(b); (void)b;
+			assert(b);
+			if (!b) {
+				q->clear();
+				return 0;
+			}
 		}
 		Unit r = vint::divu1(q ? &q->buf_[0] : 0, &x.buf_[0], xn, y);
 		if (q) {
@@ -1476,7 +1598,11 @@ class VintT {
 		size_t yn = xn + (shiftBit + unitBitSize - 1) / unitBitSize;
 		bool b;
 		y.buf_.alloc(&b, yn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			y.clear();
+			return;
+		}
 		vint::shlN(&y.buf_[0], &x.buf_[0], xn, shiftBit);
 		y.isNeg_ = x.isNeg_;
 		y.trim(yn);
@@ -1492,7 +1618,11 @@ class VintT {
 		size_t yn = xn - shiftBit / unitBitSize;
 		bool b;
 		y.buf_.alloc(&b, yn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			y.clear();
+			return;
+		}
 		vint::shrN(&y.buf_[0], &x.buf_[0], xn, shiftBit);
 		y.isNeg_ = x.isNeg_;
 		y.trim(yn);
@@ -1526,7 +1656,11 @@ class VintT {
 		assert(xn >= yn);
 		bool b;
 		z.buf_.alloc(&b, xn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return; // allocation failed: do not write into z.buf_ below
+		}
 		for (size_t i = 0; i < yn; i++) {
 			z.buf_[i] = x.buf_[i] | y.buf_[i];
 		}
@@ -1544,7 +1678,11 @@ class VintT {
 		assert(px->size() >= yn);
 		bool b;
 		z.buf_.alloc(&b, yn);
-		assert(b); (void)b;
+		assert(b);
+		if (!b) {
+			z.clear();
+			return;
+		}
 		for (size_t i = 0; i < yn; i++) {
 			z.buf_[i] = x.buf_[i] & y.buf_[i];
 		}
@@ -1917,7 +2055,7 @@ class VintT {
 };
 
 #ifdef MCL_VINT_FIXED_BUFFER
-typedef VintT<vint::FixedBuffer<mcl::vint::Unit, MCL_MAX_BIT_SIZE * 2> > Vint;
+typedef VintT<vint::FixedBuffer<mcl::vint::Unit, vint::RoundUp<MCL_MAX_BIT_SIZE>::bit * 2> > Vint;
 #else
 typedef VintT<vint::Buffer<mcl::vint::Unit> > Vint;
 #endif
diff --git a/misc/bench.txt b/misc/bench.txt
new file mode 100644
index 0000000000..3e18e6b440
--- /dev/null
+++ b/misc/bench.txt
@@ -0,0 +1,21 @@
+Core i7-7700 @ 3.6GHz
+                    BN254      BLS12_381
+G1::mul        185.863Kclk   360.723Kclk
+G1::add        812.01 clk      1.540Kclk
+G1::dbl        837.24 clk      1.977Kclk
+G2::mul        340.125Kclk   642.457Kclk
+G2::add          2.233Kclk     4.368Kclk
+G2::dbl          2.134Kclk     4.088Kclk
+GT::pow        615.052Kclk     1.055Mclk
+G1::setStr chk   1.546Kclk   534.376Kclk
+G1::setStr       1.592Kclk     4.000Kclk
+G2::setStr chk 609.195Kclk     1.402Mclk
+G2::setStr       5.444Kclk     8.282Kclk
+hashAndMapToG1  26.997Kclk   336.207Kclk
+hashAndMapToG2 212.800Kclk   775.072Kclk
+pairing        909.076Kclk     2.367Mclk
+millerLoop     549.957Kclk   983.935Kclk
+finalExp       375.203Kclk     1.404Mclk
+precomputeG2   126.000Kclk   236.912Kclk
+precomputedML  427.272Kclk   729.234Kclk
+
diff --git a/mklib.bat b/mklib.bat
index b601f15d23..aef1494465 100644
--- a/mklib.bat
+++ b/mklib.bat
@@ -23,6 +23,16 @@ if "%1"=="dll" (
      cl /c %CFLAGS% src\bn_c384.cpp /Foobj\bn_c384.obj /DMCLBN_NO_AUTOLINK
   echo link /nologo /DLL /OUT:bin\mclbn384.dll obj\bn_c384.obj obj\fp.obj %LDFLAGS% /implib:lib\mclbn384.lib
      link /nologo /DLL /OUT:bin\mclbn384.dll obj\bn_c384.obj obj\fp.obj %LDFLAGS% /implib:lib\mclbn384.lib
+
+  echo cl /c %CFLAGS% src\she_c256.cpp /Foobj\she_c256.obj /DMCLBN_NO_AUTOLINK
+     cl /c %CFLAGS% src\she_c256.cpp /Foobj\she_c256.obj /DMCLBN_NO_AUTOLINK
+  echo link /nologo /DLL /OUT:bin\mclshe256.dll obj\she_c256.obj obj\fp.obj %LDFLAGS% /implib:lib\mclshe_c256.lib
+     link /nologo /DLL /OUT:bin\mclshe256.dll obj\she_c256.obj obj\fp.obj %LDFLAGS% /implib:lib\mclshe_c256.lib
+
+  echo cl /c %CFLAGS% src\she_c384_256.cpp /Foobj\she_c384_256.obj /DMCLBN_NO_AUTOLINK
+     cl /c %CFLAGS% src\she_c384_256.cpp /Foobj\she_c384_256.obj /DMCLBN_NO_AUTOLINK
+  echo link /nologo /DLL /OUT:bin\mclshe384_256.dll obj\she_c384_256.obj obj\fp.obj %LDFLAGS% /implib:lib\mclshe_c384_256.lib
+     link /nologo /DLL /OUT:bin\mclshe384_256.dll obj\she_c384_256.obj obj\fp.obj %LDFLAGS% /implib:lib\mclshe_c384_256.lib
 ) else (
   echo cl /c %CFLAGS% src\bn_c256.cpp /Foobj\bn_c256.obj
      cl /c %CFLAGS% src\bn_c256.cpp /Foobj\bn_c256.obj
diff --git a/readme.md b/readme.md
index 15d8bef096..2bd603c438 100644
--- a/readme.md
+++ b/readme.md
@@ -10,6 +10,7 @@ mcl is a library for pairing-based cryptography.
 The current version supports the optimal Ate pairing over BN curves and BLS12-381 curves.
 
 # News
+* mclBn_setETHserialization(true) (de)serialize according to [ETH2.0 serialization of BLS12-381](https://github.com/ethereum/eth2.0-specs/blob/dev/specs/bls_signature.md#point-representations) when BLS12-381 is used.
 * (Break backward compatibility) libmcl_dy.a is renamed to libmcl.a
     * The option SHARE_BASENAME_SUF is removed
 * 2nd argument of `mclBn_init` is changed from `maxUnitSize` to `compiledTimeVar`, which must be `MCLBN_COMPILED_TIME_VAR`.
@@ -301,6 +302,19 @@ The field Fp12 is constructed via the following tower:
 * Fp12 = Fp6[w] / (w^2 - v)
 * GT = { x in Fp12 | x^r = 1 }
 
+## Curve Parameter
+r = |G1| = |G2| = |GT|
+
+curveType   | hexadecimal number|
+------------|-------------------|
+BN254 r     | 2523648240000001ba344d8000000007ff9f800000000010a10000000000000d |
+BN254 p     | 2523648240000001ba344d80000000086121000000000013a700000000000013 |
+BN381 r     | 240026400f3d82b2e42de125b00158405b710818ac000007e0042f008e3e00000000001080046200000000000000000d |
+BN381 p     | 240026400f3d82b2e42de125b00158405b710818ac00000840046200950400000000001380052e000000000000000013 |
+BN462 r     | 240480360120023ffffffffff6ff0cf6b7d9bfca0000000000d812908ee1c201f7fffffffff6ff66fc7bf717f7c0000000002401b007e010800d |
+BN462 p     | 240480360120023ffffffffff6ff0cf6b7d9bfca0000000000d812908f41c8020ffffffffff6ff66fc6ff687f640000000002401b00840138013 |
+BLS12-381 r | 73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 |
+BLS12-381 p | 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab |
 
 ## Arithmetic operations
 
@@ -352,6 +366,44 @@ getStr() method gets
 * `2 <x>` ; compressed format for even y
 * `3 <x>` ; compressed format for odd y
 
+## Generator of G1 and G2
+
+If you want to use the same BLS12-381 generators as [zkcrypto](https://github.com/zkcrypto/pairing/tree/master/src/bls12_381#g2), set them as follows:
+
+```
+// G1 P
+P.setStr('1 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569')
+
+// G2 Q
+Q.setStr('1 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582')
+```
+
+## Serialization format of G1 and G2
+
+pseudo-code to serialize p
+```
+if bit-length(p) % 8 != 0:
+  size = Fp::getByteSize()
+  if p is zero:
+    return [0] * size
+  else:
+    s = x.serialize()
+    # x in Fp2 is odd <=> x.a is odd
+    if y is odd:
+      s[byte-length(s) - 1] |= 0x80
+    return s
+else:
+  size = Fp::getByteSize() + 1
+  if p is zero:
+    return [0] * size
+  else:
+    s = x.serialize()
+    if y is odd:
+      return 2:s
+    else:
+      return 3:s
+```
+
 ## Verify an element in G2
 `G2::isValid()` checks that the element is in the curve of G2 and the order of it is r for subgroup attack.
 `G2::set()`, `G2::setStr` and `operator<<` also check the order.
@@ -403,6 +455,19 @@ This library contains some part of the followings software licensed by BSD-3-Cla
 * [_Skew Frobenius Map and Efficient Scalar Multiplication for Pairing–Based Cryptography_](https://www.researchgate.net/publication/221282560_Skew_Frobenius_Map_and_Efficient_Scalar_Multiplication_for_Pairing-Based_Cryptography),
 Y. Sakemi, Y. Nogami, K. Okeya, Y. Morikawa, CANS 2008.
 
+# History
+
+* 2019/Apr/29 v0.94 mclBn_setETHserialization supports [ETH2.0 serialization of BLS12-381](https://github.com/ethereum/eth2.0-specs/blob/dev/specs/bls_signature.md#point-representations)
+* 2019/Apr/24 v0.93 support ios
+* 2019/Mar/22 v0.92 shortcut for Ec::mul(Px, P, x) if P = 0
+* 2019/Mar/21 python binding of she256 for Linux/Mac/Windows
+* 2019/Mar/14 v0.91 modp supports mcl-wasm
+* 2019/Mar/12 v0.90 fix Vint::setArray(x) for x == this
+* 2019/Mar/07 add mclBnFr_setLittleEndianMod, mclBnFp_setLittleEndianMod
+* 2019/Feb/20 LagrangeInterpolation sets out = yVec[0] if k = 1
+* 2019/Jan/31 add mclBnFp_mapToG1, mclBnFp2_mapToG2
+* 2019/Jan/31 fix crash on x64-CPU without AVX (thanks to mortdeus)
+
 # Author
 
 光成滋生 MITSUNARI Shigeo(herumi@nifty.com)
diff --git a/sample/pairing_c.c b/sample/pairing_c.c
index 5c2cd222ab..ac559087e1 100644
--- a/sample/pairing_c.c
+++ b/sample/pairing_c.c
@@ -11,7 +11,11 @@ int main()
 	char buf[1024];
 	const char *aStr = "123";
 	const char *bStr = "456";
-	mclBn_init(MCL_BN254, MCLBN_FP_UNIT_SIZE);
+	int ret = mclBn_init(MCL_BN254, MCLBN_COMPILED_TIME_VAR);
+	if (ret != 0) {
+		printf("err ret=%d\n", ret);
+		return 1;
+	}
 	mclBnFr a, b, ab;
 	mclBnG1 P, aP;
 	mclBnG2 Q, bQ;
diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp
index 4a596c4610..4d7506ef58 100644
--- a/sample/rawbench.cpp
+++ b/sample/rawbench.cpp
@@ -25,8 +25,8 @@ void mul9(const mcl::fp::Op& op, Unit *y, const Unit *x, const Unit *p)
 
 void benchRaw(const char *p, mcl::fp::Mode mode)
 {
-	Fp::init(p, mode);
-	Fp2::init(1);
+	Fp::init(1, p, mode);
+	Fp2::init();
 	const size_t maxN = sizeof(Fp) / sizeof(Unit);
 	const mcl::fp::Op& op = Fp::getOp();
 	cybozu::XorShift rg;
diff --git a/src/fp.cpp b/src/fp.cpp
index 41e98e4d24..08e7388240 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -251,7 +251,7 @@ void setOp(Op& op, Mode mode)
 	setOp2<N, Gtag, true, false>(op);
 #ifdef MCL_USE_LLVM
 	if (mode != fp::FP_GMP && mode != fp::FP_GMP_MONT) {
-#if defined(MCL_USE_XBYAK) && CYBOZU_HOST == CYBOZU_HOST_INTEL
+#if MCL_LLVM_BMI2 == 1
 		const bool gmpIsFasterThanLLVM = false;//(N == 8 && MCL_SIZEOF_UNIT == 8);
 		Xbyak::util::Cpu cpu;
 		if (cpu.has(Xbyak::util::Cpu::tBMI2)) {
@@ -318,9 +318,9 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
 	if (mode != FP_XBYAK) return true;
 #ifdef MCL_USE_XBYAK
 	if (op.fg == 0) op.fg = Op::createFpGenerator();
-	op.fg->init(op);
+	bool useXbyak = op.fg->init(op);
 
-	if (op.isMont && N <= 4) {
+	if (useXbyak && op.isMont && N <= 4) {
 		op.fp_invOp = &invOpForMontC;
 		initInvTbl(op);
 	}
@@ -386,21 +386,31 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
 	isFullBit = (bitSize % UnitBitSize) == 0;
 
 #if defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)
-	if ((mode == FP_AUTO || mode == FP_LLVM || mode == FP_XBYAK)
-		&& mp == mpz_class("0xfffffffffffffffffffffffffffffffeffffffffffffffff")) {
-		primeMode = PM_NIST_P192;
-		isMont = false;
-		isFastMod = true;
+	if (mode == FP_AUTO || mode == FP_LLVM || mode == FP_XBYAK) {
+		const char *pStr = "0xfffffffffffffffffffffffffffffffeffffffffffffffff";
+		bool b;
+		mpz_class p192;
+		gmp::setStr(&b, p192, pStr);
+		if (b && mp == p192) {
+			primeMode = PM_NIST_P192;
+			isMont = false;
+			isFastMod = true;
+		}
 	}
-	if ((mode == FP_AUTO || mode == FP_LLVM || mode == FP_XBYAK)
-		&& mp == mpz_class("0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff")) {
-		primeMode = PM_NIST_P521;
-		isMont = false;
-		isFastMod = true;
+	if (mode == FP_AUTO || mode == FP_LLVM || mode == FP_XBYAK) {
+		const char *pStr = "0x1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff";
+		bool b;
+		mpz_class p521;
+		gmp::setStr(&b, p521, pStr);
+		if (b && mp == p521) {
+			primeMode = PM_NIST_P521;
+			isMont = false;
+			isFastMod = true;
+		}
 	}
 #endif
 #if defined(MCL_USE_VINT) && MCL_SIZEOF_UNIT == 8
-	{
+	if (mode != FP_LLVM && mode != FP_XBYAK) {
 		const char *secp256k1Str = "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f";
 		bool b;
 		mpz_class secp256k1;
@@ -476,6 +486,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
 		sq.set(&b, mp);
 		if (!b) return false;
 	}
+	modp.init(mp);
 	return fp::initForMont(*this, p, mode);
 }
 
@@ -528,6 +539,27 @@ int detectIoMode(int ioMode, const std::ios_base& ios)
 bool copyAndMask(Unit *y, const void *x, size_t xByteSize, const Op& op, MaskMode maskMode)
 {
 	const size_t fpByteSize = sizeof(Unit) * op.N;
+	if (maskMode == Mod) {
+		if (xByteSize > fpByteSize * 2) return false;
+		mpz_class mx;
+		bool b;
+		gmp::setArray(&b, mx, (const char*)x, xByteSize);
+		if (!b) return false;
+#ifdef MCL_USE_VINT
+		op.modp.modp(mx, mx);
+#else
+		mx %= op.mp;
+#endif
+		const Unit *pmx = gmp::getUnit(mx);
+		size_t i = 0;
+		for (const size_t n = gmp::getUnitSize(mx); i < n; i++) {
+			y[i] = pmx[i];
+		}
+		for (; i < op.N; i++) {
+			y[i] = 0;
+		}
+		return true;
+	}
 	if (xByteSize > fpByteSize) {
 		if (maskMode == NoMask) return false;
 		xByteSize = fpByteSize;
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index a018c03508..97bb886184 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -50,7 +50,7 @@ struct MixPack {
 	}
 	void init(Xbyak::util::Pack& remain, size_t& rspPos, size_t n, size_t useRegNum = useAll)
 	{
-		size_t pn = std::min(remain.size(), n);
+		size_t pn = (std::min)(remain.size(), n);
 		if (useRegNum != useAll && useRegNum < pn) pn = useRegNum;
 		this->mn = n - pn;
 		this->m = Xbyak::util::rsp + rspPos;
@@ -307,19 +307,20 @@ struct FpGenerator : Xbyak::CodeGenerator {
 		useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2);
 		useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX);
 	}
-	void init(Op& op)
+	bool init(Op& op)
 	{
+		if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false;
 		reset(); // reset jit code for reuse
 		setProtectModeRW(); // read/write memory
 		init_inner(op);
 //		printf("code size=%d\n", (int)getSize());
 		setProtectModeRE(); // set read/exec memory
+		return true;
 	}
 private:
 	void init_inner(Op& op)
 	{
 		op_ = &op;
-		if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return;
 		L(pL_);
 		p_ = reinterpret_cast<const uint64_t*>(getCurr());
 		for (size_t i = 0; i < op.N; i++) {
@@ -567,7 +568,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
 	void gen_mulUnit()
 	{
 //		assert(pn_ >= 2);
-		const int regNum = useMulx_ ? 2 : (1 + std::min(pn_ - 1, 8));
+		const int regNum = useMulx_ ? 2 : (1 + (std::min)(pn_ - 1, 8));
 		const int stackSize = useMulx_ ? 0 : (pn_ - 1) * 8;
 		StackFrame sf(this, 3, regNum | UseRDX, stackSize);
 		const Reg64& pz = sf.p[0];
@@ -1298,7 +1299,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
 	void gen_montMulN(const uint64_t *p, uint64_t pp, int n)
 	{
 		assert(1 <= pn_ && pn_ <= 9);
-		const int regNum = useMulx_ ? 4 : 3 + std::min(n - 1, 7);
+		const int regNum = useMulx_ ? 4 : 3 + (std::min)(n - 1, 7);
 		const int stackSize = (n * 3 + (isFullBit_ ? 2 : 1)) * 8;
 		StackFrame sf(this, 3, regNum | UseRDX, stackSize);
 		const Reg64& pz = sf.p[0];
diff --git a/src/gen.cpp b/src/gen.cpp
index 763f64b986..cd36901406 100644
--- a/src/gen.cpp
+++ b/src/gen.cpp
@@ -57,7 +57,7 @@ struct Code : public mcl::Generator {
 			return;
 		}
 		const size_t n = r.bit / unit;
-		for (size_t i = 0; i < n; i++) {
+		for (uint32_t i = 0; i < n; i++) {
 			store(trunc(r, unit), getelementptr(p, i));
 			if (i < n - 1) {
 				r = lshr(r, unit);
@@ -73,7 +73,7 @@ struct Code : public mcl::Generator {
 			p = getelementptr(p, offset);
 		}
 		Operand v = load(p);
-		for (size_t i = 1; i < n; i++) {
+		for (uint32_t i = 1; i < n; i++) {
 			v = zext(v, v.bit + unit);
 			Operand t = load(getelementptr(p, i));
 			t = zext(t, v.bit);
@@ -320,7 +320,7 @@ struct Code : public mcl::Generator {
 		br(c, zero, nonzero);
 	putLabel(zero);
 		for (uint32_t i = 0; i < n + 1; i++) {
-			storeN(makeImm(unit, 0), py, i); 
+			storeN(makeImm(unit, 0), py, i);
 		}
 		ret(Void);
 	putLabel(nonzero);
@@ -336,7 +336,7 @@ struct Code : public mcl::Generator {
 		mcl_fp_sqr_NIST_P192 = Function("mcl_fp_sqr_NIST_P192L" + suf, Void, py, px);
 		verifyAndSetPrivate(mcl_fp_sqr_NIST_P192);
 		beginFunc(mcl_fp_sqr_NIST_P192);
-		Operand buf = _alloca(unit, 192 * 2 / unit);
+		Operand buf = alloca_(unit, 192 * 2 / unit);
 		// QQQ define later
 		Function mcl_fpDbl_sqrPre("mcl_fpDbl_sqrPre" + cybozu::itoa(192 / unit) + "L" + suf, Void, buf, px);
 		call(mcl_fpDbl_sqrPre, buf, px);
@@ -353,7 +353,7 @@ struct Code : public mcl::Generator {
 		Function f("mcl_fp_mulNIST_P192L" + suf, Void, pz, px, py);
 		verifyAndSetPrivate(f);
 		beginFunc(f);
-		Operand buf = _alloca(unit, 192 * 2 / unit);
+		Operand buf = alloca_(unit, 192 * 2 / unit);
 		// QQQ define later
 		Function mcl_fpDbl_mulPre("mcl_fpDbl_mulPre" + cybozu::itoa(192 / unit) + "L" + suf, Void, buf, px, py);
 		call(mcl_fpDbl_mulPre, buf, px, py);
@@ -629,8 +629,8 @@ struct Code : public mcl::Generator {
 		Operand x = px[0];
 		for (size_t i = 1; i < n; i++) {
 			Operand y = px[i];
-			size_t shift = x.bit;
-			size_t size = x.bit + y.bit;
+			uint32_t shift = x.bit;
+			uint32_t size = x.bit + y.bit;
 			x = zext(x, size);
 			y = zext(y, size);
 			y = shl(y, shift);
@@ -713,7 +713,7 @@ struct Code : public mcl::Generator {
 			Operand d = zext(loadN(py, H), half + unit);
 			Operand t1 = add(a, b);
 			Operand t2 = add(c, d);
-			Operand buf = _alloca(unit, N);
+			Operand buf = alloca_(unit, N);
 			Operand t1L = trunc(t1, half);
 			Operand t2L = trunc(t2, half);
 			Operand c1 = trunc(lshr(t1, half), 1);
@@ -721,8 +721,8 @@ struct Code : public mcl::Generator {
 			Operand c0 = _and(c1, c2);
 			c1 = select(c1, t2L, makeImm(half, 0));
 			c2 = select(c2, t1L, makeImm(half, 0));
-			Operand buf1 = _alloca(unit, half / unit);
-			Operand buf2 = _alloca(unit, half / unit);
+			Operand buf1 = alloca_(unit, half / unit);
+			Operand buf2 = alloca_(unit, half / unit);
 			storeN(t1L, buf1);
 			storeN(t2L, buf2);
 			call(mcl_fpDbl_mulPreM[N / 2], buf, buf1, buf2);
@@ -878,7 +878,6 @@ struct Code : public mcl::Generator {
 		Operand p = loadN(pp, N);
 		Operand xy = loadN(pxy, N * 2);
 		Operand t = zext(xy, b2 + unit);
-		Operand z;
 		for (uint32_t i = 0; i < N; i++) {
 			Operand z = trunc(t, unit);
 			Operand q = mul(z, rp);
@@ -891,7 +890,7 @@ struct Code : public mcl::Generator {
 		p = zext(p, bu);
 		Operand vc = sub(t, p);
 		Operand c = trunc(lshr(vc, bit), 1);
-		z = select(c, t, vc);
+		Operand z = select(c, t, vc);
 		z = trunc(z, bit);
 		storeN(z, pz);
 		ret(Void);
diff --git a/src/llvm_gen.hpp b/src/llvm_gen.hpp
index bbc5b90305..e60b8e9a6f 100644
--- a/src/llvm_gen.hpp
+++ b/src/llvm_gen.hpp
@@ -146,7 +146,7 @@ struct Generator {
 	Eval load(const Operand& p);
 	void store(const Operand& r, const Operand& p);
 	Eval select(const Operand& c, const Operand& r1, const Operand& r2);
-	Eval _alloca(uint32_t bit, uint32_t n);
+	Eval alloca_(uint32_t bit, uint32_t n);
 	// QQQ : type of type must be Type
 	Eval bitcast(const Operand& r, const Operand& type);
 	Eval icmp(CondType type, const Operand& r1, const Operand& r2);
@@ -524,7 +524,7 @@ inline Generator::Eval Generator::select(const Generator::Operand& c, const Gene
 	return e;
 }
 
-inline Generator::Eval Generator::_alloca(uint32_t bit, uint32_t n)
+inline Generator::Eval Generator::alloca_(uint32_t bit, uint32_t n)
 {
 	Eval e;
 	e.op = Operand(IntPtr, bit);
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 57c63cfa3e..89a748e594 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -15,17 +15,27 @@
 	#pragma warning(disable : 4127)
 #endif
 
+#ifndef MCL_LLVM_BMI2
+	#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT)
+		#define MCL_LLVM_BMI2 1
+	#endif
+#endif
+
 namespace mcl { namespace fp {
 
 struct Gtag; // GMP
 struct Ltag; // LLVM
+#if MCL_LLVM_BMI2 == 1
 struct LBMI2tag; // LLVM with Intel BMI2 instruction
+#endif
 struct Atag; // asm
 
 template<class Tag> struct TagToStr { };
 template<> struct TagToStr<Gtag> { static const char *f() { return "Gtag"; } };
 template<> struct TagToStr<Ltag> { static const char *f() { return "Ltag"; } };
+#if MCL_LLVM_BMI2 == 1
 template<> struct TagToStr<LBMI2tag> { static const char *f() { return "LBMI2tag"; } };
+#endif
 template<> struct TagToStr<Atag> { static const char *f() { return "Atag"; } };
 
 template<size_t N>
diff --git a/src/low_func_llvm.hpp b/src/low_func_llvm.hpp
index 8a44c22772..a9e8a98cad 100644
--- a/src/low_func_llvm.hpp
+++ b/src/low_func_llvm.hpp
@@ -41,7 +41,7 @@ template<>const void3u MontRed<n, tag>::f = &mcl_fp_montRed ## n ## suf; \
 template<>const void4u DblAdd<n, tag>::f = &mcl_fpDbl_add ## n ## suf; \
 template<>const void4u DblSub<n, tag>::f = &mcl_fpDbl_sub ## n ## suf; \
 
-#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && !defined(MCL_USE_VINT)
+#if MCL_LLVM_BMI2 == 1
 #define MCL_DEF_LLVM_FUNC(n) \
 	MCL_DEF_LLVM_FUNC2(n, Ltag, L) \
 	MCL_DEF_LLVM_FUNC2(n, LBMI2tag, Lbmi2)
diff --git a/src/she_c384_256.cpp b/src/she_c384_256.cpp
new file mode 100644
index 0000000000..d15ef9d192
--- /dev/null
+++ b/src/she_c384_256.cpp
@@ -0,0 +1,3 @@
+#define MCLBN_FP_UNIT_SIZE 6
+#define MCLBN_FR_UNIT_SIZE 4
+#include "she_c_impl.hpp"
diff --git a/src/she_c_impl.hpp b/src/she_c_impl.hpp
index 073bc2b346..8cfa6d0cb4 100644
--- a/src/she_c_impl.hpp
+++ b/src/she_c_impl.hpp
@@ -274,6 +274,41 @@ int sheEncGT(sheCipherTextGT *c, const shePublicKey *pub, mclInt m)
 	return encT(c, pub, m);
 }
 
+bool setArray(mpz_class& m, const void *buf, mclSize bufSize)
+{
+	if (bufSize > Fr::getUnitSize() * sizeof(mcl::fp::Unit)) return false;
+	bool b;
+	mcl::gmp::setArray(&b, m, (const uint8_t*)buf, bufSize);
+	return b;
+}
+
+template<class CT>
+int encIntVecT(CT *c, const shePublicKey *pub, const void *buf, mclSize bufSize)
+	try
+{
+	mpz_class m;
+	if (!setArray(m, buf, bufSize)) return -1;
+	cast(pub)->enc(*cast(c), m);
+	return 0;
+} catch (std::exception&) {
+	return -1;
+}
+
+int sheEncIntVecG1(sheCipherTextG1 *c, const shePublicKey *pub, const void *buf, mclSize bufSize)
+{
+	return encIntVecT(c, pub, buf, bufSize);
+}
+
+int sheEncIntVecG2(sheCipherTextG2 *c, const shePublicKey *pub, const void *buf, mclSize bufSize)
+{
+	return encIntVecT(c, pub, buf, bufSize);
+}
+
+int sheEncIntVecGT(sheCipherTextGT *c, const shePublicKey *pub, const void *buf, mclSize bufSize)
+{
+	return encIntVecT(c, pub, buf, bufSize);
+}
+
 template<class CT, class PK>
 int encWithZkpBinT(CT *c, sheZkpBin *zkp, const PK *pub, int m)
 	try
@@ -512,6 +547,33 @@ int sheMulGT(sheCipherTextGT *z, const sheCipherTextGT *x, mclInt y)
 	return mulT(*cast(z), *cast(x), y);
 }
 
+template<class CT>
+int mulIntVecT(CT& z, const CT& x, const void *buf, mclSize bufSize)
+	try
+{
+	mpz_class m;
+	if (!setArray(m, buf, bufSize)) return -1;
+	CT::mul(z, x, m);
+	return 0;
+} catch (std::exception&) {
+	return -1;
+}
+
+int sheMulIntVecG1(sheCipherTextG1 *z, const sheCipherTextG1 *x, const void *buf, mclSize bufSize)
+{
+	return mulIntVecT(*cast(z), *cast(x), buf, bufSize);
+}
+
+int sheMulIntVecG2(sheCipherTextG2 *z, const sheCipherTextG2 *x, const void *buf, mclSize bufSize)
+{
+	return mulIntVecT(*cast(z), *cast(x), buf, bufSize);
+}
+
+int sheMulIntVecGT(sheCipherTextGT *z, const sheCipherTextGT *x, const void *buf, mclSize bufSize)
+{
+	return mulIntVecT(*cast(z), *cast(x), buf, bufSize);
+}
+
 int sheMul(sheCipherTextGT *z, const sheCipherTextG1 *x, const sheCipherTextG2 *y)
 {
 	return mulT(*cast(z), *cast(x), *cast(y));
@@ -627,6 +689,33 @@ int shePrecomputedPublicKeyEncGT(sheCipherTextGT *c, const shePrecomputedPublicK
 	return pEncT(c, pub, m);
 }
 
+template<class CT>
+int pEncIntVecT(CT *c, const shePrecomputedPublicKey *pub, const void *buf, mclSize bufSize) // same flow as the plain public-key variant but encrypts with a precomputed public key; 0 on success, -1 on failure
+	try
+{
+	mpz_class m;
+	if (!setArray(m, buf, bufSize)) return -1; // buffer rejected by setArray
+	cast(pub)->enc(*cast(c), m);
+	return 0;
+} catch (std::exception&) {
+	return -1; // a std::exception from the body is reported as -1 instead of propagating
+}
+
+int shePrecomputedPublicKeyEncIntVecG1(sheCipherTextG1 *c, const shePrecomputedPublicKey *pub, const void *buf, mclSize bufSize) // G1 ciphertext entry point; returns whatever pEncIntVecT returns (0 or -1)
+{
+	return pEncIntVecT(c, pub, buf, bufSize);
+}
+
+int shePrecomputedPublicKeyEncIntVecG2(sheCipherTextG2 *c, const shePrecomputedPublicKey *pub, const void *buf, mclSize bufSize) // G2 ciphertext entry point; returns whatever pEncIntVecT returns (0 or -1)
+{
+	return pEncIntVecT(c, pub, buf, bufSize);
+}
+
+int shePrecomputedPublicKeyEncIntVecGT(sheCipherTextGT *c, const shePrecomputedPublicKey *pub, const void *buf, mclSize bufSize) // GT ciphertext entry point; returns whatever pEncIntVecT returns (0 or -1)
+{
+	return pEncIntVecT(c, pub, buf, bufSize);
+}
+
 template<class PK, class CT>
 int verifyT(const PK& pub, const CT& c, const ZkpBin& zkp)
 	try
diff --git a/src/xbyak/xbyak.h b/src/xbyak/xbyak.h
index bcfeb34bf2..c28a536afc 100644
--- a/src/xbyak/xbyak.h
+++ b/src/xbyak/xbyak.h
@@ -113,7 +113,7 @@ namespace Xbyak {
 
 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x5751 /* 0xABCD = A.BC(D) */
+	VERSION = 0x5790 /* 0xABCD = A.BC(D) */
 };
 
 #ifndef MIE_INTEGER_TYPE_DEFINED
@@ -186,7 +186,8 @@ enum {
 	ERR_INVALID_ZERO,
 	ERR_INVALID_RIP_IN_AUTO_GROW,
 	ERR_INVALID_MIB_ADDRESS,
-	ERR_INTERNAL
+	ERR_INTERNAL,
+	ERR_X2APIC_IS_NOT_SUPPORTED
 };
 
 class Error : public std::exception {
@@ -248,6 +249,7 @@ class Error : public std::exception {
 			"invalid rip in AutoGrow",
 			"invalid mib address",
 			"internal error",
+			"x2APIC is not supported"
 		};
 		assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
 		return errTbl[err_];
@@ -431,7 +433,8 @@ class Operand {
 		kind_ = kind;
 		bit_ = kind == XMM ? 128 : kind == YMM ? 256 : 512;
 	}
-	void setBit(int bit) { bit_ = bit; }
+	// err if MMX/FPU/OPMASK/BNDREG
+	void setBit(int bit);
 	void setOpmaskIdx(int idx, bool ignore_idx0 = false)
 	{
 		if (!ignore_idx0 && idx == 0) throw Error(ERR_K0_IS_INVALID);
@@ -514,6 +517,48 @@ class Operand {
 	const Reg& getReg() const;
 };
 
+inline void Operand::setBit(int bit)
+{
+	if (bit != 8 && bit != 16 && bit != 32 && bit != 64 && bit != 128 && bit != 256 && bit != 512) goto ERR;
+	if (isBit(bit)) return;
+	if (is(MEM)) {
+		bit_ = bit;
+		return;
+	}
+	if (is(REG | XMM | YMM | ZMM)) {
+		int idx = getIdx();
+		// err if converting ah, bh, ch, dh
+		if (isREG(8) && (4 <= idx && idx < 8) && !isExt8bit()) goto ERR;
+		Kind kind = REG;
+		switch (bit) {
+		case 8:
+			if (idx >= 16) goto ERR;
+#ifdef XBYAK32
+			if (idx >= 4) goto ERR;
+#else
+			if (4 <= idx && idx < 8) idx |= EXT8BIT;
+#endif
+			break;
+		case 16:
+		case 32:
+		case 64:
+			if (idx >= 16) goto ERR;
+			break;
+		case 128: kind = XMM; break;
+		case 256: kind = YMM; break;
+		case 512: kind = ZMM; break;
+		}
+		idx_ = idx;
+		kind_ = kind;
+		bit_ = bit;
+		mask_ = 0;
+		rounding_ = 0;
+		return;
+	}
+ERR:
+	throw Error(ERR_CANT_CONVERT);
+}
+
 class Label;
 
 struct Reg8;
@@ -526,7 +571,8 @@ class Reg : public Operand {
 public:
 	Reg() { }
 	Reg(int idx, Kind kind, int bit = 0, bool ext8bit = false) : Operand(idx, kind, bit, ext8bit) { }
-	Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); }
+	// convert to Reg8/Reg16/Reg32/Reg64/XMM/YMM/ZMM
+	Reg changeBit(int bit) const { Reg r(*this); r.setBit(bit); return r; }
 	uint8 getRexW() const { return isREG(64) ? 8 : 0; }
 	uint8 getRexR() const { return isExtIdx() ? 4 : 0; }
 	uint8 getRexX() const { return isExtIdx() ? 2 : 0; }
@@ -650,34 +696,23 @@ struct RegRip {
 
 inline Reg8 Reg::cvt8() const
 {
-	const int idx = getIdx();
-	if (isBit(8)) return Reg8(idx, isExt8bit());
-#ifdef XBYAK32
-	if (idx >= 4) throw Error(ERR_CANT_CONVERT);
-#endif
-	return Reg8(idx, 4 <= idx && idx < 8);
+	Reg r = changeBit(8); return Reg8(r.getIdx(), r.isExt8bit());
 }
 
 inline Reg16 Reg::cvt16() const
 {
-	const int idx = getIdx();
-	if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT);
-	return Reg16(idx);
+	return Reg16(changeBit(16).getIdx());
 }
 
 inline Reg32 Reg::cvt32() const
 {
-	const int idx = getIdx();
-	if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT);
-	return Reg32(idx);
+	return Reg32(changeBit(32).getIdx());
 }
 
 #ifdef XBYAK64
 inline Reg64 Reg::cvt64() const
 {
-	const int idx = getIdx();
-	if (isBit(8) && (4 <= idx && idx < 8) && !isExt8bit()) throw Error(ERR_CANT_CONVERT);
-	return Reg64(idx);
+	return Reg64(changeBit(64).getIdx());
 }
 #endif
 
diff --git a/src/xbyak/xbyak_mnemonic.h b/src/xbyak/xbyak_mnemonic.h
index 766f2f6ecd..2733c61243 100644
--- a/src/xbyak/xbyak_mnemonic.h
+++ b/src/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "5.751"; }
+const char *getVersionString() const { return "5.79"; }
 void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@@ -1676,8 +1676,8 @@ void vbroadcasti32x4(const Ymm& y, const Operand& op) { opAVX_X_XM_IMM(y, op, T_
 void vbroadcasti32x8(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_N32, 0x5B); }
 void vbroadcasti64x2(const Ymm& y, const Operand& op) { opAVX_X_XM_IMM(y, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N16, 0x5A); }
 void vbroadcasti64x4(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_N32, 0x5B); }
-void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); }
-void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); }
+void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0xC2, imm); }
+void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_0F | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0xC2, imm); }
 void vcmpsd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N8 | T_F2 | T_0F | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); }
 void vcmpss(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_N4 | T_F3 | T_0F | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0xC2, imm); }
 void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x63); }
@@ -1725,8 +1725,8 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { o
 void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x54, imm); }
 void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0x55, imm); }
 void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0x55, imm); }
-void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }
-void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); }
+void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }
+void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); }
 void vfpclasssd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm); }
 void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }
 void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 1); }
diff --git a/src/xbyak/xbyak_util.h b/src/xbyak/xbyak_util.h
index 01544501dd..c2474c5b02 100644
--- a/src/xbyak/xbyak_util.h
+++ b/src/xbyak/xbyak_util.h
@@ -9,6 +9,11 @@
 */
 #include "xbyak.h"
 
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+	#define XBYAK_INTEL_CPU_SPECIFIC
+#endif
+
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
 #ifdef _MSC_VER
 	#if (_MSC_VER < 1400) && defined(XBYAK32)
 		static inline __declspec(naked) void __cpuid(int[4], int)
@@ -47,14 +52,30 @@
 		#endif
 	#endif
 #endif
+#endif
 
 namespace Xbyak { namespace util {
 
+typedef enum { // topology levels understood by Cpu; (value - 1) is used as the index into Cpu::numCores_
+   SmtLevel = 1, // compared with the level value extracted from CPUID leaf 0xB in Cpu::setNumCores()
+   CoreLevel = 2
+} IntelCpuTopologyLevel;
+
 /**
 	CPU detection class
 */
 class Cpu {
 	uint64 type_;
+	//system topology
+	bool x2APIC_supported_;
+	static const size_t maxTopologyLevels = 2;
+	unsigned int numCores_[maxTopologyLevels];
+
+	static const unsigned int maxNumberCacheLevels = 10;
+	unsigned int dataCacheSize_[maxNumberCacheLevels];
+	unsigned int coresSharignDataCache_[maxNumberCacheLevels];
+	unsigned int dataCacheLevels_;
+
 	unsigned int get32bitAsBE(const char *x) const
 	{
 		return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@@ -65,7 +86,7 @@ class Cpu {
 	}
 	void setFamily()
 	{
-		unsigned int data[4];
+		unsigned int data[4] = {};
 		getCpuid(1, data);
 		stepping = data[0] & mask(4);
 		model = (data[0] >> 4) & mask(4);
@@ -88,6 +109,39 @@ class Cpu {
 	{
 		return (val >> base) & ((1u << (end - base)) - 1);
 	}
+	void setNumCores() // fills x2APIC_supported_ and numCores_[] from CPUID leaf 0xB; does nothing unless tINTEL is set in type_
+	{
+		if ((type_ & tINTEL) == 0) return; // keep the values set by the constructor
+
+		unsigned int data[4] = {};
+
+		 /* CAUTION: These numbers are configuration as shipped by Intel. */
+		getCpuidEx(0x0, 0, data);
+		if (data[0] >= 0xB) {
+			 /*
+				if leaf 11 exists(x2APIC is supported),
+				we use it to get the number of smt cores and cores on socket
+
+				leaf 0xB can be zeroed-out by a hypervisor
+			*/
+			x2APIC_supported_ = true;
+			for (unsigned int i = 0; i < maxTopologyLevels; i++) { // one CPUID query per sub-leaf i
+				getCpuidEx(0xB, i, data);
+				IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
+				if (level == SmtLevel || level == CoreLevel) { // any other level value leaves numCores_ untouched
+					numCores_[level - 1] = extractBit(data[1], 0, 15);
+				}
+			}
+		} else {
+			/*
+				Failed to determine num of cores without x2APIC support.
+				TODO: USE initial APIC ID to determine ncores.
+			*/
+			numCores_[SmtLevel - 1] = 0;
+			numCores_[CoreLevel - 1] = 0;
+		}
+
+	}
 	void setCacheHierarchy()
 	{
 		if ((type_ & tINTEL) == 0) return;
@@ -96,21 +150,12 @@ class Cpu {
 //		const unsigned int INSTRUCTION_CACHE = 2;
 		const unsigned int UNIFIED_CACHE = 3;
 		unsigned int smt_width = 0;
-		unsigned int n_cores = 0;
-		unsigned int data[4];
-
-		/*
-			if leaf 11 exists, we use it to get the number of smt cores and cores on socket
-			If x2APIC is supported, these are the only correct numbers.
+		unsigned int logical_cores = 0;
+		unsigned int data[4] = {};
 
-			leaf 0xB can be zeroed-out by a hypervisor
-		*/
-		getCpuidEx(0x0, 0, data);
-		if (data[0] >= 0xB) {
-			getCpuidEx(0xB, 0, data); // CPUID for SMT Level
-			smt_width = data[1] & 0x7FFF;
-			getCpuidEx(0xB, 1, data); // CPUID for CORE Level
-			n_cores = data[1] & 0x7FFF;
+		if (x2APIC_supported_) {
+			smt_width = numCores_[0];
+			logical_cores = numCores_[1];
 		}
 
 		/*
@@ -118,29 +163,29 @@ class Cpu {
 			the first level of data cache is not shared (which is the
 			case for every existing architecture) and use this to
 			determine the SMT width for arch not supporting leaf 11.
-			when leaf 4 reports a number of core less than n_cores
+			when leaf 4 reports a number of core less than numCores_
 			on socket reported by leaf 11, then it is a correct number
 			of cores not an upperbound.
 		*/
-		for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) {
+		for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
 			getCpuidEx(0x4, i, data);
 			unsigned int cacheType = extractBit(data[0], 0, 4);
 			if (cacheType == NO_CACHE) break;
 			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
-				unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
-				if (n_cores != 0) { // true only if leaf 0xB is supported and valid
-					nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
+				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
+				if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
+					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
 				}
-				assert(nb_logical_cores != 0);
-				data_cache_size[data_cache_levels] =
+				assert(actual_logical_cores != 0);
+				dataCacheSize_[dataCacheLevels_] =
 					(extractBit(data[1], 22, 31) + 1)
 					* (extractBit(data[1], 12, 21) + 1)
 					* (extractBit(data[1], 0, 11) + 1)
 					* (data[2] + 1);
-				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
+				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
 				assert(smt_width != 0);
-				cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u);
-				data_cache_levels++;
+				coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
+				dataCacheLevels_++;
 			}
 		}
 	}
@@ -154,22 +199,25 @@ class Cpu {
 	int displayFamily; // family + extFamily
 	int displayModel; // model + extModel
 
-	// may I move these members into private?
-	static const unsigned int maxNumberCacheLevels = 10;
-	unsigned int data_cache_size[maxNumberCacheLevels];
-	unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
-	unsigned int data_cache_levels;
+	unsigned int getNumCores(IntelCpuTopologyLevel level) {
+		if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
+		switch (level) {
+		case SmtLevel: return numCores_[level - 1];
+		case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
+		default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
+		}
+	}
 
-	unsigned int getDataCacheLevels() const { return data_cache_levels; }
+	unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
 	unsigned int getCoresSharingDataCache(unsigned int i) const
 	{
-		if (i >= data_cache_levels) throw  Error(ERR_BAD_PARAMETER);
-		return cores_sharing_data_cache[i];
+		if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER);
+		return coresSharignDataCache_[i];
 	}
 	unsigned int getDataCacheSize(unsigned int i) const
 	{
-		if (i >= data_cache_levels) throw  Error(ERR_BAD_PARAMETER);
-		return data_cache_size[i];
+		if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER);
+		return dataCacheSize_[i];
 	}
 
 	/*
@@ -177,30 +225,45 @@ class Cpu {
 	*/
 	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
 	{
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+	#ifdef _MSC_VER
 		__cpuid(reinterpret_cast<int*>(data), eaxIn);
-#else
+	#else
 		__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
+	#endif
+#else
+		(void)eaxIn;
+		(void)data;
 #endif
 	}
 	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
 	{
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+	#ifdef _MSC_VER
 		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
-#else
+	#else
 		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
+	#endif
+#else
+		(void)eaxIn;
+		(void)ecxIn;
+		(void)data;
 #endif
 	}
 	static inline uint64 getXfeature()
 	{
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+	#ifdef _MSC_VER
 		return _xgetbv(0);
-#else
+	#else
 		unsigned int eax, edx;
 		// xgetvb is not support on gcc 4.2
 //		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
 		__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
 		return ((uint64)edx << 32) | eax;
+	#endif
+#else
+		return 0;
 #endif
 	}
 	typedef uint64 Type;
@@ -271,9 +334,13 @@ class Cpu {
 
 	Cpu()
 		: type_(NONE)
-		, data_cache_levels(0)
+		, x2APIC_supported_(false)
+		, numCores_()
+		, dataCacheSize_()
+		, coresSharignDataCache_()
+		, dataCacheLevels_(0)
 	{
-		unsigned int data[4];
+		unsigned int data[4] = {};
 		const unsigned int& EAX = data[0];
 		const unsigned int& EBX = data[1];
 		const unsigned int& ECX = data[2];
@@ -363,6 +430,7 @@ class Cpu {
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
 		}
 		setFamily();
+		setNumCores();
 		setCacheHierarchy();
 	}
 	void putFamily() const
@@ -381,12 +449,17 @@ class Clock {
 public:
 	static inline uint64 getRdtsc()
 	{
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+	#ifdef _MSC_VER
 		return __rdtsc();
-#else
+	#else
 		unsigned int eax, edx;
 		__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
 		return ((uint64)edx << 32) | eax;
+	#endif
+#else
+		// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
+		return 0;
 #endif
 	}
 	Clock()
diff --git a/test/bench.hpp b/test/bench.hpp
index 64c792f5ab..cc1639e6e1 100644
--- a/test/bench.hpp
+++ b/test/bench.hpp
@@ -1,5 +1,58 @@
 #include <mcl/lagrange.hpp>
 
+void benchAddDblG1() // benchmark G1::add and G1::dbl, printing z.isOne() of the inputs before each measurement
+{
+	puts("benchAddDblG1");
+	const int C = 100000; // iteration count passed to CYBOZU_BENCH_C
+	G1 P1, P2, P3;
+	hashAndMapToG1(P1, "a");
+	hashAndMapToG1(P2, "b");
+	P1 += P2;
+	P2 += P1;
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G1::add(1)", C, G1::add, P3, P1, P2); // (1): neither input normalized
+	P1.normalize();
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G1::add(2)", C, G1::add, P3, P1, P2); // (2): only P1 normalized, passed first
+	CYBOZU_BENCH_C("G1::add(3)", C, G1::add, P3, P2, P1); // (3): same inputs, swapped order
+	P2.normalize();
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G1::add(4)", C, G1::add, P3, P1, P2); // (4): both inputs normalized
+	P1 = P3;
+	printf("z.isOne()=%d\n", P1.z.isOne());
+	CYBOZU_BENCH_C("G1::dbl(1)", C, G1::dbl, P3, P1); // dbl(1): input taken from the previous add result
+	P1.normalize();
+	printf("z.isOne()=%d\n", P1.z.isOne());
+	CYBOZU_BENCH_C("G1::dbl(2)", C, G1::dbl, P3, P1); // dbl(2): normalized input
+}
+
+void benchAddDblG2() // benchmark G2::add and G2::dbl, printing z.isOne() of the inputs before each measurement
+{
+	puts("benchAddDblG2");
+	const int C = 100000; // iteration count passed to CYBOZU_BENCH_C
+	G2 P1, P2, P3;
+	hashAndMapToG2(P1, "a");
+	hashAndMapToG2(P2, "b");
+	P1 += P2;
+	P2 += P1;
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G2::add(1)", C, G2::add, P3, P1, P2); // (1): neither input normalized
+	P1.normalize();
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G2::add(2)", C, G2::add, P3, P1, P2); // (2): only P1 normalized, passed first
+	CYBOZU_BENCH_C("G2::add(3)", C, G2::add, P3, P2, P1); // (3): same inputs, swapped order
+	P2.normalize();
+	printf("z.isOne()=%d %d\n", P1.z.isOne(), P2.z.isOne());
+	CYBOZU_BENCH_C("G2::add(4)", C, G2::add, P3, P1, P2); // (4): both inputs normalized
+	P1 = P3;
+	printf("z.isOne()=%d\n", P1.z.isOne());
+	CYBOZU_BENCH_C("G2::dbl(1)", C, G2::dbl, P3, P1); // dbl(1): input taken from the previous add result
+	P1.normalize();
+	printf("z.isOne()=%d\n", P1.z.isOne());
+	CYBOZU_BENCH_C("G2::dbl(2)", C, G2::dbl, P3, P1); // dbl(2): normalized input
+}
+
+
 void testBench(const G1& P, const G2& Q)
 {
 	G1 Pa;
@@ -85,6 +138,7 @@ void testBench(const G1& P, const G2& Q)
 	CYBOZU_BENCH_C("finalExp      ", 3000, finalExp, e1, e1);
 //exit(1);
 	std::vector<Fp6> Qcoeff;
+	CYBOZU_BENCH_C("precomputeG2  ", C, precomputeG2, Qcoeff, Q);
 	precomputeG2(Qcoeff, Q);
 	CYBOZU_BENCH_C("precomputedML ", C, precomputedMillerLoop, e2, P, Qcoeff);
 }
@@ -131,4 +185,8 @@ void testLagrange()
 	Fr s;
 	mcl::LagrangeInterpolation(s, x, y, k);
 	CYBOZU_TEST_EQUAL(s, c[0]);
+	mcl::LagrangeInterpolation(s, x, y, 1);
+	CYBOZU_TEST_EQUAL(s, y[0]);
+	mcl::evaluatePolynomial(y[0], c, 1, x[0]);
+	CYBOZU_TEST_EQUAL(y[0], c[0]);
 }
diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp
index 7011516bd8..cc8ddef8aa 100644
--- a/test/bls12_test.cpp
+++ b/test/bls12_test.cpp
@@ -307,6 +307,53 @@ void testTrivial(const G1& P, const G2& Q)
 	CYBOZU_TEST_EQUAL(e, 1);
 }
 
+template<class T>
+void deserializeAndSerialize(const T& x) // round-trip check: serialize x, deserialize into a fresh T, and require equality
+{
+	char buf[1024];
+	size_t n = x.serialize(buf, sizeof(buf));
+	CYBOZU_TEST_ASSERT(n > 0); // the test requires a non-zero serialized length
+	T y;
+	CYBOZU_TEST_EQUAL(y.deserialize(buf, n), n); // deserialize must report the same byte count that serialize produced
+	CYBOZU_TEST_EQUAL(x, y);
+}
+
+void testSerialize(const G1& P, const G2& Q) // serialization tests run with Fp::setETHserialization(true); the flag is switched back off at the end
+{
+	Fp::setETHserialization(true); // big endian
+	const struct FpTbl {
+		const char *in; // value handed to Fp::setStr
+		const char out[97]; // expected hex-string serialization (96 characters plus the terminator)
+	} fpTbl[] = {
+		{
+			"0x12345678901234567",
+			"000000000000000000000000000000000000000000000000000000000000000000000000000000012345678901234567"
+		},
+	};
+	char buf[1024];
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(fpTbl); i++) {
+		Fp x, y;
+		x.setStr(fpTbl[i].in);
+		size_t n = x.serialize(buf, sizeof(buf), mcl::IoSerializeHexStr);
+		CYBOZU_TEST_EQUAL(n, sizeof(fpTbl[i].out) - 1); // length must match the expected string without its terminator
+		CYBOZU_TEST_EQUAL_ARRAY(buf, fpTbl[i].out, n);
+		CYBOZU_TEST_EQUAL(y.deserialize(buf, n, mcl::IoSerializeHexStr), n);
+		CYBOZU_TEST_EQUAL(x, y);
+	}
+	deserializeAndSerialize(P); // G1 round trips: P, -P and the cleared point
+	deserializeAndSerialize(-P);
+	G1 zero1;
+	zero1.clear();
+	deserializeAndSerialize(zero1);
+
+	deserializeAndSerialize(Q); // G2 round trips: Q, -Q and the cleared point
+	deserializeAndSerialize(-Q);
+	G2 zero2;
+	zero2.clear();
+	deserializeAndSerialize(zero2);
+	Fp::setETHserialization(false); // restore the setting for the tests that run afterwards
+}
+
 #include "bench.hpp"
 
 CYBOZU_TEST_AUTO(naive)
@@ -325,6 +372,7 @@ CYBOZU_TEST_AUTO(naive)
 		clk.put();
 		return;
 #endif
+		testSerialize(P, Q);
 		testParam(ts);
 		testIo(P, Q);
 //		testFp12pow(P, Q);
diff --git a/test/bn_c_test.hpp b/test/bn_c_test.hpp
index ba4dcf848f..8db329dca3 100644
--- a/test/bn_c_test.hpp
+++ b/test/bn_c_test.hpp
@@ -2,8 +2,10 @@
 	include from bn_if256_test.cpp and bn_if384_test.cpp
 */
 #include <mcl/bn.h>
+#include <mcl/ecparam.hpp>
 #include <cybozu/test.hpp>
 #include <iostream>
+#include <mcl/gmp_util.hpp>
 
 template<size_t N>
 std::ostream& dump(std::ostream& os, const uint64_t (&x)[N])
@@ -132,7 +134,7 @@ CYBOZU_TEST_AUTO(Fr)
 	}
 }
 
-CYBOZU_TEST_AUTO(G1)
+void G1test()
 {
 	mclBnG1 x, y, z;
 	memset(&x, 0x1, sizeof(x));
@@ -149,10 +151,10 @@ CYBOZU_TEST_AUTO(G1)
 
 	char buf[1024];
 	size_t size;
-	size = mclBnG1_getStr(buf, sizeof(buf), &x, 10);
+	size = mclBnG1_getStr(buf, sizeof(buf), &y, 10);
 	CYBOZU_TEST_ASSERT(size > 0);
 	CYBOZU_TEST_EQUAL(size, strlen(buf));
-	CYBOZU_TEST_ASSERT(!mclBnG1_setStr(&y, buf, strlen(buf), 10));
+	CYBOZU_TEST_ASSERT(!mclBnG1_setStr(&x, buf, strlen(buf), 10));
 	CYBOZU_TEST_ASSERT(mclBnG1_isEqual(&x, &y));
 
 	mclBnG1_neg(&x, &x);
@@ -176,6 +178,11 @@ CYBOZU_TEST_AUTO(G1)
 	CYBOZU_TEST_ASSERT(mclBnG1_isEqual(&y, &z));
 }
 
+CYBOZU_TEST_AUTO(G1) // registered test case that simply runs G1test()
+{
+	G1test();
+}
+
 CYBOZU_TEST_AUTO(G2)
 {
 	mclBnG2 x, y, z;
@@ -560,3 +567,133 @@ CYBOZU_TEST_AUTO(setRandFunc)
 		}
 	}
 }
+
+CYBOZU_TEST_AUTO(Fp) // mclBnFp C API: hash, serialize/deserialize round trip, setLittleEndian and clear
+{
+	mclBnFp x1, x2;
+	char buf[1024];
+	int ret = mclBnFp_setHashOf(&x1, "abc", 3);
+	CYBOZU_TEST_ASSERT(ret == 0);
+	mclSize n = mclBnFp_serialize(buf, sizeof(buf), &x1);
+	CYBOZU_TEST_ASSERT(n > 0);
+	n = mclBnFp_deserialize(&x2, buf, n);
+	CYBOZU_TEST_ASSERT(n > 0);
+	CYBOZU_TEST_ASSERT(mclBnFp_isEqual(&x1, &x2)); // round trip must reproduce x1
+	for (size_t i = 0; i < n; i++) {
+		buf[i] = char(i); // byte pattern 0, 1, 2, ...
+	}
+	ret = mclBnFp_setLittleEndian(&x1, buf, n);
+	CYBOZU_TEST_ASSERT(ret == 0);
+	memset(buf, 0, sizeof(buf));
+	n = mclBnFp_serialize(buf, sizeof(buf), &x1);
+	CYBOZU_TEST_ASSERT(n > 0);
+	for (size_t i = 0; i < n - 1; i++) { // the last byte is deliberately not compared; presumably setLittleEndian may alter the top byte - confirm
+		CYBOZU_TEST_EQUAL(buf[i], char(i));
+	}
+	mclBnFp_clear(&x1);
+	memset(&x2, 0, sizeof(x2));
+	CYBOZU_TEST_ASSERT(mclBnFp_isEqual(&x1, &x2)); // a cleared value must equal an all-zero struct
+}
+
+CYBOZU_TEST_AUTO(mod) // checks mclBnFp_setLittleEndianMod / mclBnFr_setLittleEndianMod against x % p computed with gmp
+{
+	{
+		// Fp
+		char buf[1024];
+		mclBn_getFieldOrder(buf, sizeof(buf));
+		mpz_class p(buf);
+		mpz_class x = mpz_class(1) << (mclBn_getFpByteSize() * 2); // x = 2^(2 * FpByteSize); NOTE(review): the shift is in bits, so x may be smaller than p - confirm this is intended
+		mclBnFp y;
+		int ret = mclBnFp_setLittleEndianMod(&y, mcl::gmp::getUnit(x), mcl::gmp::getUnitSize(x) * sizeof(void*)); // NOTE(review): byte length uses sizeof(void*), presumably equal to the gmp unit size - confirm
+		CYBOZU_TEST_EQUAL(ret, 0);
+		mclBnFp_getStr(buf, sizeof(buf), &y, 10);
+		CYBOZU_TEST_EQUAL(mpz_class(buf), x % p);
+	}
+	{
+		// Fr
+		char buf[1024];
+		mclBn_getCurveOrder(buf, sizeof(buf));
+		mpz_class p(buf);
+		mpz_class x = mpz_class(1) << (mclBn_getFrByteSize() * 2); // x = 2^(2 * FrByteSize); same review note as the Fp case
+		mclBnFr y;
+		int ret = mclBnFr_setLittleEndianMod(&y, mcl::gmp::getUnit(x), mcl::gmp::getUnitSize(x) * sizeof(void*)); // same sizeof(void*) assumption as the Fp case
+		CYBOZU_TEST_EQUAL(ret, 0);
+		mclBnFr_getStr(buf, sizeof(buf), &y, 10);
+		CYBOZU_TEST_EQUAL(mpz_class(buf), x % p);
+	}
+}
+
+CYBOZU_TEST_AUTO(Fp2) // mclBnFp2 C API: serialize/deserialize round trip and clear
+{
+	mclBnFp2 x1, x2;
+	char buf[1024];
+	int ret = mclBnFp_setHashOf(&x1.d[0], "abc", 3); // fill both components of x1 with hash values
+	CYBOZU_TEST_ASSERT(ret == 0);
+	ret = mclBnFp_setHashOf(&x1.d[1], "xyz", 3);
+	CYBOZU_TEST_ASSERT(ret == 0);
+	mclSize n = mclBnFp2_serialize(buf, sizeof(buf), &x1);
+	CYBOZU_TEST_ASSERT(n > 0);
+	n = mclBnFp2_deserialize(&x2, buf, n);
+	CYBOZU_TEST_ASSERT(n > 0);
+	CYBOZU_TEST_ASSERT(mclBnFp2_isEqual(&x1, &x2)); // round trip must reproduce x1
+	mclBnFp2_clear(&x1);
+	memset(&x2, 0, sizeof(x2));
+	CYBOZU_TEST_ASSERT(mclBnFp2_isEqual(&x1, &x2)); // a cleared value must equal an all-zero struct
+}
+
+CYBOZU_TEST_AUTO(mapToG1) // mclBnFp_mapToG1 applied to the hash of "abc" must give the same point as mclBnG1_hashAndMapTo("abc")
+{
+	mclBnFp x;
+	mclBnG1 P1, P2;
+	mclBnFp_setHashOf(&x, "abc", 3);
+	int ret = mclBnFp_mapToG1(&P1, &x);
+	CYBOZU_TEST_ASSERT(ret == 0);
+	mclBnG1_hashAndMapTo(&P2, "abc", 3); // reference point from the one-step API
+	CYBOZU_TEST_ASSERT(mclBnG1_isEqual(&P1, &P2));
+}
+
+CYBOZU_TEST_AUTO(mapToG2) // mclBnFp2_mapToG2 applied to (hash("abc"), 0) must give the same point as mclBnG2_hashAndMapTo("abc")
+{
+	mclBnFp2 x;
+	mclBnG2 P1, P2;
+	mclBnFp_setHashOf(&x.d[0], "abc", 3);
+	mclBnFp_clear(&x.d[1]); // second component is cleared
+	int ret = mclBnFp2_mapToG2(&P1, &x);
+	CYBOZU_TEST_ASSERT(ret == 0);
+	mclBnG2_hashAndMapTo(&P2, "abc", 3); // reference point from the one-step API
+	CYBOZU_TEST_ASSERT(mclBnG2_isEqual(&P1, &P2));
+}
+
+void G1onlyTest(int curve) // initialize the library for `curve`, print its G1 base point, then run G1test()
+{
+	printf("curve=%d\n", curve);
+	int ret;
+	ret = mclBn_init(curve, MCLBN_COMPILED_TIME_VAR);
+	CYBOZU_TEST_EQUAL(ret, 0);
+	mclBnG1 P0;
+	ret = mclBnG1_getBasePoint(&P0);
+	CYBOZU_TEST_EQUAL(ret, 0);
+	char buf[256];
+	ret = mclBnG1_getStr(buf, sizeof(buf), &P0, 16); // base 16 output; the test requires a positive return value
+	CYBOZU_TEST_ASSERT(ret > 0);
+	printf("basePoint=%s\n", buf);
+	G1test(); // runs under the curve initialized above
+}
+
+CYBOZU_TEST_AUTO(G1only) // runs G1onlyTest for every curve id in tbl
+{
+	const int tbl[] = {
+		MCL_SECP192K1,
+		MCL_NIST_P192,
+		MCL_SECP224K1,
+		MCL_NIST_P224, // hashAndMapTo is error
+		MCL_SECP256K1,
+		MCL_NIST_P256,
+#if MCLBN_FP_UNIT_SIZE >= 6 && MCLBN_FR_UNIT_SIZE >= 6
+		MCL_SECP384R1, // only included when both unit sizes are at least 6
+#endif
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		G1onlyTest(tbl[i]);
+	}
+}
diff --git a/test/bn_test.cpp b/test/bn_test.cpp
index 929e23517d..071ec706cc 100644
--- a/test/bn_test.cpp
+++ b/test/bn_test.cpp
@@ -118,9 +118,12 @@ void testMapToG1()
 	}
 #ifndef MCL_AVOID_EXCEPTION_TEST
 	if (BN::param.cp.b == 2) {
+		Fp c1;
+		bool b = Fp::squareRoot(c1, -3);
+		CYBOZU_TEST_ASSERT(b);
 		CYBOZU_TEST_EXCEPTION(mapToG1(g, 0), cybozu::Exception);
-		CYBOZU_TEST_EXCEPTION(mapToG1(g, BN::param.mapTo.c1_), cybozu::Exception);
-		CYBOZU_TEST_EXCEPTION(mapToG1(g, -BN::param.mapTo.c1_), cybozu::Exception);
+		CYBOZU_TEST_EXCEPTION(mapToG1(g, c1), cybozu::Exception);
+		CYBOZU_TEST_EXCEPTION(mapToG1(g, -c1), cybozu::Exception);
 	}
 #endif
 }
@@ -349,6 +352,7 @@ void testIo(const G1& P, const G2& Q)
 
 CYBOZU_TEST_AUTO(naive)
 {
+	printf("mcl version=%03x\n", mcl::version);
 	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(g_testSetTbl); i++) {
 		const TestSet& ts = g_testSetTbl[i];
 		printf("i=%d curve=%s\n", int(i), ts.name);
@@ -375,6 +379,8 @@ CYBOZU_TEST_AUTO(naive)
 		testPrecomputed(P, Q);
 		testMillerLoop2(P, Q);
 		testBench(P, Q);
+		benchAddDblG1();
+		benchAddDblG2();
 	}
 	int count = (int)clk.getCount();
 	if (count) {
diff --git a/test/ec_test.cpp b/test/ec_test.cpp
index a78357eac5..ec49adbfe8 100644
--- a/test/ec_test.cpp
+++ b/test/ec_test.cpp
@@ -160,6 +160,42 @@ struct Test {
 			Ec::mul(Q, P, 1);
 			CYBOZU_TEST_EQUAL(P, Q);
 		}
+		{
+			Ec R2;
+			P += P;
+			Q += P;
+			CYBOZU_TEST_ASSERT(!P.z.isOne());
+			CYBOZU_TEST_ASSERT(!Q.z.isOne());
+			Ec::add(R2, P, Q);
+
+			P.normalize();
+			CYBOZU_TEST_ASSERT(P.z.isOne());
+			CYBOZU_TEST_ASSERT(!Q.z.isOne());
+			// affine + generic
+			Ec::add(R, P, Q);
+			CYBOZU_TEST_EQUAL(R, R2);
+			// generic + affine
+			Ec::add(R, Q, P);
+			CYBOZU_TEST_EQUAL(R, R2);
+
+			Q.normalize();
+			CYBOZU_TEST_ASSERT(P.z.isOne());
+			CYBOZU_TEST_ASSERT(Q.z.isOne());
+			// affine + affine
+			Ec::add(R, P, Q);
+			CYBOZU_TEST_EQUAL(R, R2);
+
+			P += P;
+			CYBOZU_TEST_ASSERT(!P.z.isOne());
+			// generic
+			Ec::dbl(R2, P);
+
+			P.normalize();
+			CYBOZU_TEST_ASSERT(P.z.isOne());
+			// affine
+			Ec::dbl(R, P);
+			CYBOZU_TEST_EQUAL(R, R2);
+		}
 	}
 
 	void mul() const
diff --git a/test/elgamal_test.cpp b/test/elgamal_test.cpp
index 9532fc597d..8f27f90107 100644
--- a/test/elgamal_test.cpp
+++ b/test/elgamal_test.cpp
@@ -11,7 +11,7 @@ typedef mcl::EcT<Fp> Ec;
 typedef mcl::ElgamalT<Ec, Zn> ElgamalEc;
 
 const mcl::EcParam& para = mcl::ecparam::secp192k1;
-cybozu::RandomGenerator rg;
+cybozu::RandomGenerator g_rg;
 
 CYBOZU_TEST_AUTO(testEc)
 {
@@ -26,15 +26,15 @@ CYBOZU_TEST_AUTO(testEc)
 		Zn = <P>
 	*/
 	ElgamalEc::PrivateKey prv;
-	prv.init(P, bitSize, rg);
+	prv.init(P, bitSize, g_rg);
 	prv.setCache(0, 60000);
 	const ElgamalEc::PublicKey& pub = prv.getPublicKey();
 
 	const int m1 = 12345;
 	const int m2 = 17655;
 	ElgamalEc::CipherText c1, c2;
-	pub.enc(c1, m1, rg);
-	pub.enc(c2, m2, rg);
+	pub.enc(c1, m1, g_rg);
+	pub.enc(c2, m2, g_rg);
 	Zn dec1, dec2;
 	prv.dec(dec1, c1);
 	prv.dec(dec2, c2);
@@ -69,7 +69,7 @@ CYBOZU_TEST_AUTO(testEc)
 			ss << pub;
 			ss >> pub2;
 		}
-		pub2.enc(cc2, m2, rg);
+		pub2.enc(cc2, m2, g_rg);
 		prv.dec(d, cc2);
 		CYBOZU_TEST_EQUAL(d, m2);
 	}
@@ -86,7 +86,7 @@ CYBOZU_TEST_AUTO(testEc)
 	}
 	// rerandomize
 	c1 = c2;
-	pub.rerandomize(c1, rg);
+	pub.rerandomize(c1, g_rg);
 	// verify c1 != c2
 	CYBOZU_TEST_ASSERT(c1.c1 != c2.c1);
 	CYBOZU_TEST_ASSERT(c1.c2 != c2.c2);
@@ -98,7 +98,7 @@ CYBOZU_TEST_AUTO(testEc)
 	{
 		ElgamalEc::CipherText c;
 		Zn m = 1234;
-		pub.enc(c, m, rg);
+		pub.enc(c, m, g_rg);
 		c.neg();
 		Zn dec;
 		prv.dec(dec, c);
@@ -109,7 +109,7 @@ CYBOZU_TEST_AUTO(testEc)
 		ElgamalEc::CipherText c;
 		Zn m = 123;
 		int x = 111;
-		pub.enc(c, m, rg);
+		pub.enc(c, m, g_rg);
 		Zn dec;
 		prv.dec(dec, c);
 		c.mul(x);
@@ -122,7 +122,7 @@ CYBOZU_TEST_AUTO(testEc)
 	for (int i = -10; i < 10; i++) {
 		ElgamalEc::CipherText c;
 		const Zn mm = i;
-		pub.enc(c, mm, rg);
+		pub.enc(c, mm, g_rg);
 		Zn dec;
 		prv.dec(dec, c, 1000);
 		CYBOZU_TEST_EQUAL(dec, mm);
@@ -131,7 +131,7 @@ CYBOZU_TEST_AUTO(testEc)
 	// isZeroMessage
 	for (int m = 0; m < 10; m++) {
 		ElgamalEc::CipherText c0;
-		pub.enc(c0, m, rg);
+		pub.enc(c0, m, g_rg);
 		if (m == 0) {
 			CYBOZU_TEST_ASSERT(prv.isZeroMessage(c0));
 		} else {
@@ -142,14 +142,14 @@ CYBOZU_TEST_AUTO(testEc)
 	{
 		ElgamalEc::Zkp zkp;
 		ElgamalEc::CipherText c;
-		pub.encWithZkp(c, zkp, 0, rg);
+		pub.encWithZkp(c, zkp, 0, g_rg);
 		CYBOZU_TEST_ASSERT(pub.verify(c, zkp));
-		zkp.s0 += 1;
+		zkp.s[0] += 1;
 		CYBOZU_TEST_ASSERT(!pub.verify(c, zkp));
-		pub.encWithZkp(c, zkp, 1, rg);
+		pub.encWithZkp(c, zkp, 1, g_rg);
 		CYBOZU_TEST_ASSERT(pub.verify(c, zkp));
-		zkp.s0 += 1;
+		zkp.s[0] += 1;
 		CYBOZU_TEST_ASSERT(!pub.verify(c, zkp));
-		CYBOZU_TEST_EXCEPTION_MESSAGE(pub.encWithZkp(c, zkp, 2, rg), cybozu::Exception, "encWithZkp");
+		CYBOZU_TEST_EXCEPTION_MESSAGE(pub.encWithZkp(c, zkp, 2, g_rg), cybozu::Exception, "encWithZkp");
 	}
 }
diff --git a/test/fp_test.cpp b/test/fp_test.cpp
index d8b4742a34..d82c30f5e4 100644
--- a/test/fp_test.cpp
+++ b/test/fp_test.cpp
@@ -347,7 +347,6 @@ void compareTest()
 
 void moduloTest(const char *pStr)
 {
-std::cout << std::hex;
 	std::string str;
 	Fp::getModulo(str);
 	CYBOZU_TEST_EQUAL(str, mcl::gmp::getStr(mpz_class(pStr)));
@@ -565,6 +564,45 @@ void setArrayMaskTest2(mcl::fp::Mode mode)
 	}
 }
 
+// Exercise Fp::setArray() in mcl::fp::Mod mode with 0, 1, values just
+// below/at/above p, 2p, p(p-1) and p^2, plus an all-ones value with twice as
+// many bits as p. The success flag must be set exactly when x occupies at
+// most 2 * N units, and in that case y must equal x % p.
+void setArrayModTest()
+{
+	const mpz_class& p = Fp::getOp().mp;
+	mpz_class vals[] = {
+		0, // slot for the all-ones value assigned below
+		0, 1,
+		p - 1, p, p + 1,
+		p * 2 - 1, p * 2, p * 2 + 1,
+		p * (p - 1) - 1, p * (p - 1), p * (p - 1) + 1,
+		p * p - 1, p * p, p * p + 1,
+	};
+	const size_t valNum = CYBOZU_NUM_OF_ARRAY(vals);
+	// a run of 2 * bitSize(p) '1' characters, parsed as a binary number
+	std::string allOnes(mcl::gmp::getBitSize(p) * 2, '1');
+	mcl::gmp::setStr(vals[0], allOnes, 2);
+
+	// inputs longer than twice the unit count of Fp are expected to be rejected
+	const size_t maxUnitNum = Fp::getOp().N * 2;
+	size_t idx = 0;
+	while (idx < valNum) {
+		const mpz_class& x = vals[idx++];
+		const mcl::fp::Unit *units = mcl::gmp::getUnit(x);
+		const size_t unitNum = mcl::gmp::getUnitSize(x);
+		Fp y;
+		bool b;
+		y.setArray(&b, units, unitNum, mcl::fp::Mod);
+		const bool expected = unitNum <= maxUnitNum;
+		CYBOZU_TEST_EQUAL(b, expected);
+		if (b) {
+			// y is compared only when setArray reported success
+			CYBOZU_TEST_EQUAL(y.getMpz(), x % p);
+		}
+	}
+}
+
 CYBOZU_TEST_AUTO(set64bit)
 {
 	Fp::init("0x1000000000000000000f");
@@ -777,6 +815,39 @@ void serializeTest()
 	}
 }
 
+// Cross-check mcl::Modp::modp() against GMP's operator% for the field prime p,
+// using 0, 1, values just below/at/above p, 2p, p(p-1) and p^2, plus an
+// all-ones value with twice as many bits as p.
+void modpTest()
+{
+	const mpz_class& p = Fp::getOp().mp;
+	mpz_class vals[] = {
+		0, // slot for the all-ones value assigned below
+		0, 1,
+		p - 1, p, p + 1,
+		p * 2 - 1, p * 2, p * 2 + 1,
+		p * (p - 1) - 1, p * (p - 1), p * (p - 1) + 1,
+		p * p - 1, p * p, p * p + 1,
+	};
+	const size_t valNum = CYBOZU_NUM_OF_ARRAY(vals);
+	// a run of 2 * bitSize(p) '1' characters, parsed as a binary number
+	std::string allOnes(mcl::gmp::getBitSize(p) * 2, '1');
+	mcl::gmp::setStr(vals[0], allOnes, 2);
+
+	mcl::Modp modp;
+	modp.init(p);
+	size_t idx = 0;
+	while (idx < valNum) {
+		const mpz_class& x = vals[idx];
+		// reference remainder computed by GMP
+		const mpz_class r1 = x % p;
+		mpz_class r2;
+		modp.modp(r2, x);
+		CYBOZU_TEST_EQUAL(r1, r2);
+		idx++;
+	}
+}
+
 #include <iostream>
 #if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521)
 CYBOZU_TEST_AUTO(mod_NIST_P521)
@@ -880,12 +951,14 @@ void sub(mcl::fp::Mode mode)
 		powGmp();
 		setArrayTest1();
 		setArrayMaskTest1();
+		setArrayModTest();
 		getUint64Test();
 		getInt64Test();
 		divBy2Test();
 		getStrTest();
 		setHashOfTest();
 		serializeTest();
+		modpTest();
 	}
 	anotherFpTest(mode);
 	setArrayTest2(mode);
diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp
index a7123f7a5d..c26c5d7e9a 100644
--- a/test/fp_tower_test.cpp
+++ b/test/fp_tower_test.cpp
@@ -146,6 +146,18 @@ void testFp2()
 		CYBOZU_TEST_ASSERT(Fp2::squareRoot(z, y));
 		CYBOZU_TEST_EQUAL(z * z, y);
 	}
+
+	// serialize
+	for (int i = 0; i < 2; i++) {
+		Fp::setETHserialization(i == 0);
+		Fp2 x, y;
+		x.setStr("0x1234567789345 0x23424324");
+		char buf[256];
+		size_t n = x.serialize(buf, sizeof(buf));
+		CYBOZU_TEST_ASSERT(n > 0);
+		CYBOZU_TEST_EQUAL(y.deserialize(buf, n), n);
+		CYBOZU_TEST_EQUAL(x, y);
+	}
 }
 
 void testFp6sqr(const Fp2& a, const Fp2& b, const Fp2& c, const Fp6& x)
@@ -390,6 +402,7 @@ void test(const char *p, mcl::fp::Mode mode)
 	Fp::init(xi_a, p, mode);
 	printf("mode=%s\n", mcl::fp::ModeToStr(mode));
 	Fp2::init();
+	printf("bitSize=%d\n", (int)Fp::getBitSize());
 #if 0
 	if (Fp::getBitSize() > 256) {
 		printf("not support p=%s\n", p);
@@ -446,7 +459,7 @@ void testAll()
 	};
 	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
 		const char *p = tbl[i];
-		printf("prime=%s %d\n", p, (int)(strlen(p) - 2) * 4);
+		printf("prime=%s\n", p);
 		test(p, mcl::fp::FP_GMP);
 #ifdef MCL_USE_LLVM
 		test(p, mcl::fp::FP_LLVM);
diff --git a/test/glv_test.cpp b/test/glv_test.cpp
index a917f51f47..0e6fccde76 100644
--- a/test/glv_test.cpp
+++ b/test/glv_test.cpp
@@ -165,7 +165,6 @@ void testGLV2()
 	G2 Q0, Q1, Q2;
 	mpz_class z = BN::param.z;
 	mpz_class r = BN::param.r;
-	mpz_class lambda = 6 * z * z;
 	mcl::bn::local::GLV2 glv2;
 	glv2.init(r, z, BN::param.isBLS12);
 	mpz_class n;
diff --git a/test/modp_test.cpp b/test/modp_test.cpp
new file mode 100644
index 0000000000..bf9da38bfa
--- /dev/null
+++ b/test/modp_test.cpp
@@ -0,0 +1,43 @@
+#include <mcl/gmp_util.hpp>
+#include <cybozu/benchmark.hpp>
+#include <cybozu/test.hpp>
+#include <iostream>
+
+/*
+	compare mcl::Modp::modp() with GMP's x % p for every (p, x) pair below
+	and benchmark both (C is the count handed to CYBOZU_BENCH_C)
+*/
+CYBOZU_TEST_AUTO(modp)
+{
+	const int C = 1000000;
+	const char *pTbl[] = {
+		"0x2523648240000001ba344d8000000007ff9f800000000010a10000000000000d",
+		"0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab",
+		"0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001",
+	};
+	const char *xTbl[] = {
+		"0x12345678892082039482094823",
+		"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+		"0x10000000000000000000000000000000000000000000000000000000000000000",
+		"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+	};
+	mcl::Modp modp;
+	// std::hex is sticky: remember the flags so std::cout is not left in hex after this test
+	const std::ios_base::fmtflags orgFlags = std::cout.flags();
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(pTbl); i++) {
+		const mpz_class p(pTbl[i]);
+		std::cout << std::hex << "p=" << p << std::endl;
+		modp.init(p);
+		for (size_t j = 0; j < CYBOZU_NUM_OF_ARRAY(xTbl); j++) {
+			const mpz_class x(xTbl[j]);
+			std::cout << std::hex << "x=" << x << std::endl;
+			mpz_class r1, r2;
+			r1 = x % p;
+			modp.modp(r2, x);
+			CYBOZU_TEST_EQUAL(r1, r2);
+			CYBOZU_BENCH_C("x % p", C, mcl::gmp::mod, r1, x, p);
+			CYBOZU_BENCH_C("modp ", C, modp.modp, r2, x);
+		}
+	}
+	std::cout.flags(orgFlags);
+}
diff --git a/test/she_c384_256_test.cpp b/test/she_c384_256_test.cpp
new file mode 100644
index 0000000000..d255f50cc2
--- /dev/null
+++ b/test/she_c384_256_test.cpp
@@ -0,0 +1,3 @@
+#define MCLBN_FP_UNIT_SIZE 6 // with FP_UNIT_SIZE 6 and FR_UNIT_SIZE 4, CYBOZU_TEST_AUTO(init) in she_c_test.hpp selects MCL_BLS12_381
+#define MCLBN_FR_UNIT_SIZE 4
+#include "she_c_test.hpp" // the header reads both macros in #if, so they are defined first
diff --git a/test/she_c_test.hpp b/test/she_c_test.hpp
index 8287c0e0ad..f7709080b6 100644
--- a/test/she_c_test.hpp
+++ b/test/she_c_test.hpp
@@ -9,16 +9,17 @@ const size_t tryNum = 1024;
 
 CYBOZU_TEST_AUTO(init)
 {
-	int curve;
 #if MCLBN_FP_UNIT_SIZE == 4
-	curve = MCL_BN254;
-#elif MCLBN_FP_UNIT_SIZE == 6
-//	curve = MCL_BN381_1;
-	curve = MCL_BLS12_381;
+	int curve = MCL_BN254;
+#elif MCLBN_FP_UNIT_SIZE == 6 && MCLBN_FR_UNIT_SIZE == 4
+	int curve = MCL_BLS12_381;
+#elif MCLBN_FP_UNIT_SIZE == 6 && MCLBN_FR_UNIT_SIZE == 6
+	int curve = MCL_BN381_1;
 #elif MCLBN_FP_UNIT_SIZE == 8
-	curve = MCL_BN462;
+	int curve = MCL_BN462;
 #endif
 	int ret;
+	printf("curve=%d\n", curve);
 	ret = sheInit(curve, MCLBN_COMPILED_TIME_VAR);
 	CYBOZU_TEST_EQUAL(ret, 0);
 	ret = sheSetRangeForDLP(hashSize);
@@ -432,6 +433,40 @@ CYBOZU_TEST_AUTO(ZkpEq)
 	shePrecomputedPublicKeyDestroy(ppub);
 }
 
+// Encrypt buf with encv, bump buf[0] by 5, build mul(enc(1), buf), subtract the first ciphertext; dec must yield 5.
+template<class CT, class ENC, class ENCV, class DEC, class SUB, class MUL>
+void IntVecTest(const sheSecretKey& sec, const shePublicKey& pub, const ENC& enc, const ENCV& encv, const DEC& dec, const SUB& sub, const MUL& mul, uint8_t *buf, size_t bufSize)
+{
+	CT base, scaled;
+	int ret = encv(&base, &pub, buf, bufSize);
+	CYBOZU_TEST_EQUAL(ret, 0);
+	buf[0] += 5; // the caller's buffer is modified in place
+	enc(&scaled, &pub, 1);
+	ret = mul(&scaled, &scaled, buf, bufSize);
+	CYBOZU_TEST_EQUAL(ret, 0);
+	sub(&scaled, &scaled, &base);
+	int64_t d;
+	ret = dec(&d, &sec, &scaled);
+	CYBOZU_TEST_EQUAL(ret, 0);
+	CYBOZU_TEST_EQUAL(d, 5);
+}
+
+// Run IntVecTest once per ciphertext type (G1, G2, GT) with a freshly generated
+// key pair; all three runs share buf, of which only the first n bytes are passed.
+CYBOZU_TEST_AUTO(IntVec)
+{
+	sheSecretKey sec;
+	shePublicKey pub;
+	sheSecretKeySetByCSPRNG(&sec);
+	sheGetPublicKey(&pub, &sec);
+	uint8_t buf[48];
+	const size_t n = 32;
+	for (size_t pos = 0; pos != sizeof(buf); ++pos) buf[pos] = uint8_t(pos + 5);
+	IntVecTest<sheCipherTextG1>(sec, pub, sheEncG1, sheEncIntVecG1, sheDecG1, sheSubG1, sheMulIntVecG1, buf, n);
+	IntVecTest<sheCipherTextG2>(sec, pub, sheEncG2, sheEncIntVecG2, sheDecG2, sheSubG2, sheMulIntVecG2, buf, n);
+	IntVecTest<sheCipherTextGT>(sec, pub, sheEncGT, sheEncIntVecGT, sheDecGT, sheSubGT, sheMulIntVecGT, buf, n);
+}
+
 CYBOZU_TEST_AUTO(finalExp)
 {
 	sheSecretKey sec;
diff --git a/test/she_test.cpp b/test/she_test.cpp
index 9292c35f4c..cb64478529 100644
--- a/test/she_test.cpp
+++ b/test/she_test.cpp
@@ -564,7 +564,7 @@ CYBOZU_TEST_AUTO(saveHash)
 static inline void putK(double t) { printf("%.2e\n", t * 1e-3); }
 
 template<class CT>
-void decBench(const char *msg, int C, const SecretKey& sec, const PublicKey& pub, int64_t (SecretKey::*dec)(const CT& c) const = &SecretKey::dec)
+void decBench(const char *msg, int C, const SecretKey& sec, const PublicKey& pub, int64_t (SecretKey::*dec)(const CT& c, bool *pok) const = &SecretKey::dec)
 {
 	int64_t begin = 1 << 20;
 	int64_t end = 1LL << 32;
@@ -573,8 +573,8 @@ void decBench(const char *msg, int C, const SecretKey& sec, const PublicKey& pub
 		int64_t x = begin - 1;
 		pub.enc(c, x);
 		printf("m=%08x ", (uint32_t)x);
-		CYBOZU_BENCH_C(msg, C, (sec.*dec), c);
-		CYBOZU_TEST_EQUAL((sec.*dec)(c), x);
+		CYBOZU_BENCH_C(msg, C, (sec.*dec), c, 0);
+		CYBOZU_TEST_EQUAL((sec.*dec)(c, 0), x);
 		begin *= 2;
 	}
 	int64_t mTbl[] = { -0x80000003ll, 0x80000000ll, 0x80000005ll };
@@ -582,7 +582,7 @@ void decBench(const char *msg, int C, const SecretKey& sec, const PublicKey& pub
 		int64_t m = mTbl[i];
 		CT c;
 		pub.enc(c, m);
-		CYBOZU_TEST_EQUAL((sec.*dec)(c), m);
+		CYBOZU_TEST_EQUAL((sec.*dec)(c, 0), m);
 	}
 }
 
diff --git a/test/vint_test.cpp b/test/vint_test.cpp
index 0eea8a9f14..a2d42197e6 100644
--- a/test/vint_test.cpp
+++ b/test/vint_test.cpp
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#define MCL_MAX_BIT_SIZE 521
 #include <mcl/vint.hpp>
 #include <iostream>
 #include <sstream>
@@ -6,6 +7,9 @@
 #include <cybozu/benchmark.hpp>
 #include <cybozu/test.hpp>
 #include <cybozu/xorshift.hpp>
+#ifndef DONT_USE_GMP_IN_TEST
+#include <gmpxx.h>
+#endif
 
 #define PUT(x) std::cout << #x "=" << x << std::endl;
 
@@ -551,14 +555,70 @@ CYBOZU_TEST_AUTO(quotRem)
 			"0xfffffffffffff0000000000000000000000000000000000000000000000000000000000000001",
 			"521481209941628322292632858916605385658190900090571826892867289394157573281830188869820088065",
 		},
+		{
+			"0x1230000000000000456",
+			"0x1230000000000000457",
+			"0x1230000000000000456",
+		},
+		{
+			"0x1230000000000000456",
+			"0x1230000000000000456",
+			"0",
+		},
+		{
+			"0x1230000000000000456",
+			"0x1230000000000000455",
+			"1",
+		},
+		{
+			"0x1230000000000000456",
+			"0x2000000000000000000",
+			"0x1230000000000000456",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffff",
+			"0x80000000000000000000000000000000",
+			"0x7fffffffffffffffffffffffffffffff",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffff",
+			"0x7fffffffffffffffffffffffffffffff",
+			"1",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffff",
+			"0x70000000000000000000000000000000",
+			"0x1fffffffffffffffffffffffffffffff",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffff",
+			"0x30000000000000000000000000000000",
+			"0x0fffffffffffffffffffffffffffffff",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffff",
+			"0x10000000000000000000000000000000",
+			"0x0fffffffffffffffffffffffffffffff",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+			"0x2523648240000001ba344d80000000086121000000000013a700000000000013",
+			"0x212ba4f27ffffff5a2c62effffffffcdb939ffffffffff8a15ffffffffffff8d",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+			"0x2523648240000001ba344d8000000007ff9f800000000010a10000000000000d",
+			"0x212ba4f27ffffff5a2c62effffffffd00242ffffffffff9c39ffffffffffffb1",
+		},
 	};
-	mcl::Vint x, y, r;
+	mcl::Vint x, y, q, r1, r2;
 	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
 		x.setStr(tbl[i].x);
 		y.setStr(tbl[i].y);
-		r.setStr(tbl[i].r);
-		x %= y;
-		CYBOZU_TEST_EQUAL(x, r);
+		r1.setStr(tbl[i].r);
+		mcl::Vint::divMod(&q, r2, x, y);
+		CYBOZU_TEST_EQUAL(r1, r2);
+		CYBOZU_TEST_EQUAL(x, q * y + r2);
 	}
 }
 
@@ -1176,6 +1236,36 @@ CYBOZU_TEST_AUTO(bench)
 	CYBOZU_BENCH_C("sub", N, Vint::sub, z, x, y);
 	CYBOZU_BENCH_C("mul", N, Vint::mul, z, x, y);
 	CYBOZU_BENCH_C("div", N, Vint::div, y, z, x);
+
+	const struct {
+		const char *x;
+		const char *y;
+	} tbl[] = {
+		{
+			"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+			"0x2523648240000001ba344d8000000007ff9f800000000010a10000000000000d"
+		},
+		{
+			"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+			"0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab",
+		},
+		{
+			"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+			"0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001",
+		},
+
+	};
+	for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+		x.setStr(tbl[i].x);
+		y.setStr(tbl[i].y);
+		CYBOZU_BENCH_C("fast div", N, Vint::div, z, x, y);
+#ifndef DONT_USE_GMP_IN_TEST
+		{
+			mpz_class mx(tbl[i].x), my(tbl[i].y), mz;
+			CYBOZU_BENCH_C("gmp", N, mpz_div, mz.get_mpz_t(), mx.get_mpz_t(), my.get_mpz_t());
+		}
+#endif
+	}
 }
 
 struct Seq {