Add WhitespaceTokenizer and UnicodeScriptTokenizer for NLP

Add CaseFold and NormalizeUTF8

Add RegexReplace

Add RegexTokenizer

Add BasicTokenizer

Add WordpieceTokenizer

Add BertTokenizer
qianlong 2020-05-05 16:51:05 +08:00
parent ea37dc76f0
commit 4f16f036be
45 changed files with 2944 additions and 10 deletions
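The ops introduced here compose: as the diff below shows, BertTokenizerOp simply runs BasicTokenizerOp and then WordpieceTokenizerOp. A minimal sketch of that chaining, using only classes and kDef* constants that appear in this commit (the helper name TokenizeForBert and the pre-built vocab/input tensors are illustrative assumptions, not part of the change):

#include <memory>

#include "dataset/core/tensor.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Illustrative helper (not in this commit): the two stages that
// BertTokenizerOp::Compute chains internally.
Status TokenizeForBert(const std::shared_ptr<Vocab> &vocab, const std::shared_ptr<Tensor> &input,
                       std::shared_ptr<Tensor> *output) {
  // Stage 1: case fold, normalize, strip accents/controls, then split on
  // whitespace, punctuation and CJK characters.
  BasicTokenizerOp basic(/*lower_case=*/true);
  std::shared_ptr<Tensor> words;
  RETURN_IF_NOT_OK(basic.Compute(input, &words));
  // Stage 2: greedy vocabulary lookup into subword pieces.
  WordpieceTokenizerOp wordpiece(vocab, WordpieceTokenizerOp::kDefSuffixIndicator,
                                 WordpieceTokenizerOp::kDefMaxBytesPerToken,
                                 WordpieceTokenizerOp::kDefUnknownToken);
  return wordpiece.Compute(words, output);
}
}  // namespace dataset
}  // namespace mindspore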

@@ -3057,6 +3057,587 @@ Software: tinyxml2 8.0.0
Copyright 2011, John Resig.
Copyright 2011, The Dojo Foundation.
Software: icu 67.1
Copyright (C) 2000-2004, International Business Machines Corporation
Copyright (C) 2002-2014, International Business Machines
(C) Copyright IBM Corp. 1998-2011 - All Rights Reserved
Copyright (C) 2003-2008, International Business Machines
Copyright (C) 2005-2006, International Business Machines
Copyright (C) 2016 and later: Unicode, Inc. and others.
Copyright (c) 2001-2010 International Business Machines
Copyright (C) 2009, International Business Machines
Copyright (c) 2010-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (C) 2002-2015, International Business Machines
Copyright (c) 1997-2014, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1997-2008, International Business Machines Corporation and
Copyright (c) 1997-2003, International Business Machines Corporation and
Copyright (c) 1996-2012, International Business Machines Corporation and
Copyright (c) 1997-2016, International Business Machines
Copyright (c) 1997-2013 International Business Machines
Copyright (c) 1997-2016, International Business Machines Corporation and
Copyright (c) 1997-2001, International Business Machines Corporation and
Copyright (c) 1997-2012, International Business Machines Corporation and
Copyright (c) 1997-2005, International Business Machines Corporation and
Copyright (c) 1997-2010, International Business Machines Corporation and
Copyright (c) 2011-2016, International Business Machines Corporation
Copyright (c) 1997-2009, International Business Machines Corporation and
Copyright (c) 1997-2002,2008, International Business Machines Corporation and
Copyright (c) 1997-2009,2014, International Business Machines
Copyright (C) 2000-2009, International Business Machines
Copyright (c) 1997-2015, International Business Machines Corporation and
Copyright (c) 1997-2013, International Business Machines Corporation and
Copyright (c) 2001-2016, International Business Machines Corporation and
Copyright (c) 1997-2016, International Business Machines Corporation
Copyright (c) 1997-2003, 2007-2009 International Business Machines Corporation and
Copyright (c) 2011-2014, International Business Machines Corporation
Copyright (c) 2003-2009, International Business Machines
Copyright (c) 2016, International Business Machines Corporation
Copyright (c) 1997-2004, International Business Machines Corporation and
Copyright (C) 2002-2016, International Business Machines
Copyright (C) 1998-2014, International Business Machines Corporation
Copyright (c) 2003-2013, International Business Machines Corporation and
Copyright (c) 2005-2016, International Business Machines Corporation and
Copyright (c) 1999-2013, International Business Machines Corporation and
Copyright (c) 2003-2015, International Business Machines Corporation and
Copyright (C) 2003-2016, International Business Machines
Copyright (C) 2003-2014, International Business Machines
Copyright (C) 2003, International Business Machines
Copyright (c) 1998-2016, International Business Machines Corporation and
Copyright (c) 2004-2015, International Business Machines Corporation and
Copyright (c) 2009-2016, International Business Machines Corporation and
Copyright (C) 2003-2012, International Business Machines
Copyright (c) 2000-2016, International Business Machines Corporation and
Copyright (C) 2001-2014, International Business Machines
Copyright (C) 2001-2016, International Business Machines
Copyright (c) 1997-2014, International Business Machines
© 2017 and later: Unicode, Inc. and others.
Copyright (C) 2007-2016, International Business Machines
© 2018 and later: Unicode, Inc. and others.
Copyright (c) 2015, International Business Machines Corporation
Copyright (c) 2014-2016, International Business Machines Corporation
Copyright (c) 2002-2016, International Business Machines
Copyright (c) 2001-2011,2015 International Business Machines
Copyright (c) 2001-2016 International Business Machines
Copyright (c) 2005-2013, International Business Machines Corporation and
Copyright (c) 1998-2014, International Business Machines Corporation and
Copyright (C) 1997-2016 International Business Machines
Copyright (C) 2009-2014, International Business Machines Corporation and
Copyright (c) 2002-2014, International Business Machines Corporation
Copyright (c) 2002-2007, International Business Machines Corporation
Copyright (C) 1996-2012, International Business Machines Corporation
Copyright (C) 1996-2008, International Business Machines Corporation
Copyright (C) 2007-2013, International Business Machines Corporation and
Copyright (C) 2008-2015, International Business Machines
Copyright (C) 2003-2013, International Business Machines Corporation and
Copyright (C) 2003-2013, International Business Machines Corporation
Copyright (C) 1997-2016, International Business Machines Corporation and
Copyright (C) 2001-2011, International Business Machines
Copyright (C) 2001-2008, International Business Machines
Copyright (C) 2003 - 2009, International Business Machines Corporation and
Copyright (C) 2003 - 2008, International Business Machines Corporation and
Copyright (C) 2007-2014, International Business Machines Corporation
Copyright (C) 2007-2013, International Business Machines Corporation
Copyright (C) 1997-2013, International Business Machines Corporation and
Copyright (C) 1996-2014, International Business Machines Corporation and
Copyright (C) 2010-2014, International Business Machines
Copyright (C) 2010-2015, International Business Machines
Copyright (C) 2013-2014, International Business Machines
Copyright (C) 1996-2015, International Business Machines
Copyright (C) 1996-2014, International Business Machines
Copyright (C) 2012-2015, International Business Machines
Copyright (C) 2012-2014, International Business Machines
Copyright (C) 2013-2015, International Business Machines
Copyright (C) 2013-2016, International Business Machines
Copyright (C) 1999-2016, International Business Machines
Copyright (C) 1999-2015, International Business Machines
Copyright (C) 1999-2014, International Business Machines
Copyright (C) 2015-2016, International Business Machines Corporation and others.
Copyright (C) 2003 - 2013, International Business Machines Corporation and
Copyright (C) 1999-2011, International Business Machines
Copyright (C) 2005-2016, International Business Machines
Copyright (C) 2005-2012, International Business Machines
Copyright (C) 2005-2015, International Business Machines
Copyright (C) 2005-2013, International Business Machines
Copyright (C) 2005-2014, International Business Machines
Copyright (c) 2004, International Business Machines
Copyright (c) 2004-2014 International Business Machines
Copyright (c) 2004-2014, International Business Machines
Copyright (C) 2013, International Business Machines Corporation
Copyright (C) 1997-2015, International Business Machines Corporation and
Copyright (C) 2016, International Business Machines
Copyright (c) IBM Corporation, 2000-2012. All rights reserved.
Copyright (c) IBM Corporation, 2000-2011. All rights reserved.
Copyright (c) IBM Corporation, 2000-2014. All rights reserved.
Copyright (c) IBM Corporation, 2000-2010. All rights reserved.
Copyright (c) IBM Corporation, 2000-2016. All rights reserved.
Copyright 2010 the V8 project authors. All rights reserved.
Copyright 2006-2008 the V8 project authors. All rights reserved.
Copyright 2012 the V8 project authors. All rights reserved.
Copyright (C) 2008-2016, International Business Machines Corporation and
Copyright (C) 2007-2016, International Business Machines Corporation and
Copyright (C) 2007-2012, International Business Machines Corporation and
Copyright (c) 2001-2011, International Business Machines
Copyright (c) 2001-2007, International Business Machines
Copyright (C) 2010-2014, International Business Machines Corporation and
Copyright (C) 1997-2010, International Business Machines Corporation and
Copyright (C) 1997-2012, International Business Machines Corporation and
Copyright (C) 2009-2015, International Business Machines Corporation and
Copyright (C) 2009-2012, International Business Machines Corporation and
Copyright (c) 2002-2012, International Business Machines Corporation
Copyright (c) 2002-2011, International Business Machines Corporation
Copyright (C) 2008-2013, International Business Machines Corporation and
Copyright (c) 2003-2008, International Business Machines
Copyright (C) 2003-2016, International Business Machines Corporation
Copyright (C) 2003-2014, International Business Machines Corporation
Copyright (C) 2003-2008, International Business Machines Corporation
Copyright (C) 2005-2008, International Business Machines
Copyright (C) 2003-2015, International Business Machines Corporation
Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and
Copyright (c) 2004-2016, International Business Machines
© 2020 and later: Unicode, Inc. and others.
Copyright (C) 2007-2008, International Business Machines Corporation and
Copyright (C) 2001-2007, International Business Machines
Copyright (C) 1997-2012, International Business Machines
Copyright (C) 1997-2015, International Business Machines
Copyright (C) 2001-2010, International Business Machines
Copyright (c) 2000-2005, International Business Machines
Copyright (c) 2000-2007, International Business Machines
© 2019 and later: Unicode, Inc. and others.
Copyright (C) 2010-2015, International Business Machines Corporation and
Copyright (C) 2015, International Business Machines Corporation and
Copyright (c) 2003-2013, International Business Machines
Copyright (C) 2001-2012, International Business Machines
Copyright (C) 2001-2011, International Business Machines Corporation
Copyright (C) 2014-2016, International Business Machines
Copyright (C) 1997-2015, International Business Machines Corporation
Copyright (C) 1999-2007, International Business Machines
Copyright (C) 1999-2007, International Business Machines Corporation
Copyright (C) 1999-2011, International Business Machines Corporation
Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2002-2016 International Business Machines Corporation and others.
Copyright (C) 2002-2016, International Business Machines Corporation and others.
Copyright (C) 2002-2016 International Business Machines Corporation
Copyright (C) 2002-2015, International Business Machines Corporation and others.
Copyright (C) 2012 International Business Machines Corporation
Copyright (C) 2002-2015 International Business Machines Corporation
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright (C) 2003-2010, International Business Machines Corporation and others.
Copyright (c) 2008-2011, International Business Machines Corporation and
Copyright (c) 2008-2010, International Business Machines Corporation and
Copyright (C) 2014-2016, International Business Machines Corporation and
Copyright (C) 2013, International Business Machines Corporation and
Copyright (c) 2014, International Business Machines
Copyright (C) 2014, International Business Machines
Copyright (C) 2013, International Business Machines
Copyright (C) 2001-2008,2010 IBM and others. All rights reserved.
Copyright (C) 2010, Yahoo! Inc.
Copyright (c) 1997-2011, International Business Machines Corporation and
Copyright (C) 2013-2014, International Business Machines Corporation and
Copyright (C) 2009-2013, International Business Machines Corporation and
Copyright (C) 1996-2012, International Business Machines Corporation and
Copyright (C) 2015, International Business Machines Corporation
Copyright (c) 2001-2012, International Business Machines Corporation
Copyright (C) 2001-2014 IBM and others. All rights reserved.
Copyright (C) 2008-2014, Google, International Business Machines Corporation and
Copyright (C) 2008, Google, International Business Machines Corporation and
Copyright (C) 2008-2015, Google, International Business Machines Corporation
Copyright (c) 2001-2014, International Business Machines
Copyright (c) 2002-2010, International Business Machines Corporation
Copyright (C) 2011-2015, International Business Machines Corporation and
Copyright (C) 2011-2016, International Business Machines Corporation and
Copyright (C) 2011-2012, International Business Machines Corporation and
Copyright (C) 1996-2016, International Business Machines
Copyright (C) 1998-2014, International Business Machines
Copyright (C) 2004-2016, International Business Machines
Copyright (C) 2010-2011, International Business Machines
Copyright (C) 2009-2015, International Business Machines
Copyright (C) 2015, International Business Machines
Copyright (C) 2012-2016, International Business Machines
Copyright (C) 1999-2012, International Business Machines
Copyright (C) 2001, International Business Machines
Copyright (C) 2013, International Business Machines Corporation and others.
Copyright (C) 2010-2012, International Business Machines
Copyright (C) 2004-2015, International Business Machines
Copyright (C) 2003-2006, International Business Machines
Copyright (C) 2013-2015, International Business Machines Corporation and others.
Copyright (C) 2001-2015 IBM and others. All rights reserved.
Copyright (C) 2008-2015, International Business Machines Corporation
Copyright (C) 2008-2016, International Business Machines
Copyright (C) 2008-2013, International Business Machines Corporation
Copyright (C) 2004-2012, International Business Machines Corporation and
Copyright (C) 1997-2009,2014 International Business Machines
Copyright (C) 2009-2011, International Business Machines Corporation and
Copyright (C) 2009-2016, International Business Machines Corporation and
Copyright (C) 2009-2013, International Business Machines
Copyright (C) 2008-2011, International Business Machines
Copyright (C) 2007-2014, International Business Machines Corporation and
Copyright (C) 2009-2010, International Business Machines Corporation and
Copyright (C) 2001-2016 International Business Machines Corporation
Copyright (c) 2002-2011, International Business Machines
Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
Copyright (c) 2013-2016 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2013-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2007-2012, International Business Machines Corporation and
Copyright (c) 2007-2012, International Business Machines
Copyright (C) 2010, International Business Machines
Copyright (C) 1997-2011, International Business Machines
Copyright (C) 1997-2005, International Business Machines
Copyright (C) 2009-2011, International Business Machines
Copyright (C) 2003-2015, International Business Machines
Copyright (C) 2009-2016, International Business Machines
Copyright (C) 2008-2012, International Business Machines
Copyright (C) 2008, International Business Machines
Copyright (C) 2011-2014, International Business Machines
Copyright (C) 2011-2013, International Business Machines
Copyright (C) 2005, International Business Machines
Copyright (C) 1999-2013, International Business Machines
Copyright (C) 1998-2016, International Business Machines
Copyright (c) 2007-2014, International Business Machines Corporation and
Copyright (C) 2003-2013, International Business Machines
Copyright (c) 2007-2016, International Business Machines Corporation and
Copyright (c) 2008-2015, International Business Machines
Copyright (C) 1999-2010, International Business Machines
Copyright (C) 2000-2015, International Business Machines
Copyright (C) 2000-2011, International Business Machines
Copyright (C) 2000-2012, International Business Machines
Copyright (C) 2000-2010, International Business Machines
Copyright (C) 2004-2010, International Business Machines
Copyright (C) 2004-2005, International Business Machines
Copyright (c) 2013-2014, International Business Machines
Copyright (c) 1991-2013 Unicode, Inc.
© 2019 Unicode®, Inc.
Copyright (C) 2018 and later: Unicode, Inc. and others.
Copyright (c) 2008-2013 International Business Machines
Copyright (C) 2002-2010, International Business Machines
Copyright (c) 2012-2015 International Business Machines
© 2020 Unicode®, Inc.
Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved
Copyright (c) 2011-2012, International Business Machines Corporation and
Copyright (C) 1998-2000, International Business Machines
© 2017 Unicode®, Inc.
Copyright (c) 2007-2015 International Business Machines
Copyright (C) 2004-2006, International Business Machines
Copyright (C) 2003-2005, International Business Machines
Copyright (c) 1999-2014 International Business Machines
Copyright (c) 2003, International Business Machines
Copyright (C) 2014 International Business Machines
Copyright (c) 2001-2003 International Business Machines
Copyright (c) 2004-2011 International Business Machines
Copyright (C) 2015-2016, International Business Machines
Copyright (c) 2001-2015 International Business Machines
Copyright (C) 2003-2012, International Business Machines Corporation and
Copyright (c) 2003 National Electronics and Computer Technology Center and others
Copyright (C) 2005-2010, International Business Machines
Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved
Copyright (C) 2004-2016 International Business Machines
Copyright (C) 1998-2013, International Business Machines
Copyright (C) 1998-2010, International Business Machines
Copyright (c) 1999-2004, International Business Machines
Copyright (C) 2002-2006 International Business Machines Corporation
Copyright (C) 1999-2006, International Business Machines
Copyright (C) 2002-2016 IBM, Inc. All Rights Reserved.
Copyright (c) 2002-2006, International Business Machines
(C) Copyright IBM Corp. 1998-2007 - All Rights Reserved
Copyright (C) 1999-2003, International Business Machines
Copyright (C) 1998-2006, International Business Machines Corporation and
Copyright (C) 1998-2003, International Business Machines Corporation and
Copyright (C) 2003 - 2008, International Business Machines
Copyright (C) 1999-2008, International Business Machines
Copyright (C) 1999-2001, International Business Machines
Copyright (C) 1999-2005, International Business Machines
Copyright (C) 2016 and later: Unicode, Inc. and others.
Copyright (c) 2001-2010 IBM Corporation and others. All Rights Reserved.
Copyright (C) 1998-2005, International Business Machines Corporation and
Copyright (C) 1998-2001, International Business Machines Corporation and
Copyright (c) 2002-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2000-2014, International Business Machines
Copyright (C) 1996-2013, International Business Machines
Copyright (c) 2002-2006, International Business Machines Corporation and
Copyright (c) 2004-2010, International Business Machines Corporation and
Copyright (C) 2004-2011, International Business Machines
Copyright (c) 2002-2005, International Business Machines Corporation and
Copyright (c) 2002-2014, International Business Machines
Copyright (c) 1997-2012, International Business Machines
Copyright (c) 2002-2008, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved.
Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved.
Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,and others. All Rights Reserved.
Copyright (c) 1999-2003, International Business Machines Corporation and
Copyright (c) 2003-2014, International Business Machines
Copyright (c) 2002-2010, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1999-2010, International Business Machines Corporation and
Copyright (c) 1999-2002, International Business Machines Corporation and
Copyright (C) 2002-2003, International Business Machines
Copyright (C) 2002, International Business Machines
Copyright (c) 2007, International Business Machines Corporation and
Copyright (C) 2007, International Business Machines
Copyright (C) 2001-2006, International Business Machines
Copyright (C) 2010-2014, International Business Machines Corporation and others.
Copyright (C) 2005-2016, International Business Machines Corporation and
Copyright (C) 2015-2016, International Business Machines Corporation and
Copyright (C) 2008-2012, International Business Machines Corporation
Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2014-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (C) 2002-2011, International Business Machines
Copyright (c) 2003-2010, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2012 IBM Corporation and Others. All Rights Reserved.
Copyright (C) 1998-2012, International Business Machines Corporation
Copyright (c) 2009, International Business Machines Corporation and
Copyright (C) The Internet Society (2002). All Rights Reserved.
Copyright (c) 2015, International Business Machines Corporation and
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1998-2016, International Business Machines Corporation
Copyright (c) 2011-2016,International Business Machines
Copyright (C) 2012 International Business Machines Corporation and Others. All Rights Reserved.
Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2011-2012,International Business Machines
Copyright (c) 2007, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2007-2007, International Business Machines
(C) Copyright IBM Corp. 1998-2014 - All Rights Reserved
Copyright (C) 1998-2002, International Business Machines
Copyright (c) 2001-2007, International Business Machines Corporation and others. All Rights Reserved.
(C) Copyright IBM Corp. 1998-2013 - All Rights Reserved
Copyright (C) 1998-2015, International Business Machines
Copyright (C) 2001-2014 International Business Machines
Copyright (C) 2011-2016, International Business Machines
Copyright (C) 2011-2015, International Business Machines
Copyright (c) 1999-2014, International Business Machines Corporation and
Copyright (c) 1999-2009, International Business Machines Corporation and
Copyright (c) 2010,International Business Machines
Copyright (c) 2010-2016,International Business Machines
Copyright (c) 2002-2005, International Business Machines
Copyright (C) 2000-2003, International Business Machines
Copyright (c) 2008-2014, International Business Machines Corporation and
Copyright (C) 2001 - 2005, International Business Machines
Copyright (C) 2001-2005, International Business Machines
Copyright (C) 1995-2014, International Business Machines
Copyright (c) 2000-2004 IBM, Inc. and Others.
Copyright (c) 2002-2014, International Business Machines Corporation and
Copyright (c) 2007-2013, International Business Machines Corporation and
Copyright (c) 2002-2012, International Business Machines Corporation and
Copyright (C) 2002-2012, International Business Machines
Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2009-2014, International Business Machines
Copyright (C) 2008, International Business Machines Corporation and others.
Copyright (C) 2000-2016, International Business Machines
Copyright (C) 2011-2014 International Business Machines
Copyright (C) 1997-2014, International Business Machines
Copyright (C) 1997-2013, International Business Machines
Copyright (c) 2004-2006, International Business Machines
Copyright (C) 1997-2016, International Business Machines
Copyright (C) 1997-2006, International Business Machines
Copyright (C) 1997-2011, International Business Machines Corporation and others.
Copyright (C) 1997-2013, International Business Machines Corporation and others.
Copyright (c) 2004-2015, International Business Machines
Copyright (C) 2009-2017, International Business Machines Corporation,Google, and others. All Rights Reserved.
Copyright (C) 1997-2016, International Business Machines Corporation and others.
Copyright (C) 2008-2015, International Business Machines Corporation and
Copyright (C) 1997-2015, International Business Machines Corporation and others.
Copyright (C) 2014-2016, International Business Machines Corporation and others.
Copyright (c) 2014-2016, International Business Machines
Copyright (C) 2001-2011 IBM and others. All rights reserved.
Copyright (C) 1996-2014, International Business Machines Corporation and others.
Copyright (C) 1996-2016, International Business Machines Corporation and
Copyright (C) 2009-2016, International Business Machines Corporation,
Copyright (C) 2009-2010, Google, International Business Machines Corporation and
Copyright (C) 2008-2014, Google, International Business Machines Corporation
Copyright (C) 1996-2015, International Business Machines Corporation and
Copyright (c) 1996-2015, International Business Machines Corporation and others.
Copyright (C) 2010-2012,2015 International Business Machines
Copyright (C) 2007-2015, International Business Machines
Copyright (C) 2013-2014, International Business Machines Corporation and others.
Copyright (C) 2010-2013, International Business Machines
Copyright (c) 2002-2005, International Business Machines Corporation
Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
Copyright (C) 2008-2016, International Business Machines Corporation
Copyright (C) 2004 - 2008, International Business Machines Corporation and
Copyright (C) 1997-2011,2014-2015 International Business Machines
Copyright (C) 2001-2003, International Business Machines
Copyright (C) 1999-2009, International Business Machines
Copyright (C) 2020 and later: Unicode, Inc. and others.
Copyright (c) 2002, International Business Machines Corporation and
Copyright (C) 2000-2008, International Business Machines
Copyright (C) 1998-2006, International Business Machines
Copyright (C) 1998-2001, International Business Machines Corporation
Copyright (C) 1998-2004, International Business Machines Corporation
Copyright (C) 2000, International Business Machines
Copyright (c) 1999-2016, International Business Machines Corporation and
Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1999-2012, International Business Machines Corporation and
Copyright (C) 1998-2011, International Business Machines
Copyright (C) 2008-2014, International Business Machines Corporation and
Copyright (C) 2003-2004, International Business Machines
Copyright (c) 2003-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2002-2006 IBM, Inc. All Rights Reserved.
Copyright (C) 2004-2008, International Business Machines
Copyright (c) 2002-2016 International Business Machines Corporation and
Copyright (c) 2002-2015, International Business Machines Corporation and
Copyright (C) 2002-2016, International Business Machines Corporation
Copyright (c) 2002-2010,International Business Machines
Copyright (c) 2002-2014,International Business Machines
Copyright (c) 2002-2016,International Business Machines
Copyright (C) 2016 International Business Machines Corporation
Copyright © 2019 and later: Unicode, Inc. and others.
Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2005-2006, International Business Machines Corporation and
Copyright (c) 1997-2004, International Business Machines Corporation
Copyright (c) 2012-2016, International Business Machines Corporation
Copyright (c) 2012-2014, International Business Machines Corporation and
Copyright (c) 1997-2014, International Business Machines Corporation
Copyright (c) 1996-2016, International Business Machines Corporation and
Copyright (c) 2003-2013, International Business Machines Corporation
Copyright (c) 2003-2008, International Business Machines Corporation
Copyright (c) 1997-2015, International Business Machines Corporation
Copyright (c) 2002-2016, International Business Machines Corporation and
Copyright (c) 1997-2002, International Business Machines Corporation and
Copyright (C) 1996-2012, International Business Machines
Copyright (c) 1997-2013 International Business Machines Corporation and
Copyright (c) 2010-2012, International Business Machines Corporation and
Copyright (c) 1997-2011, International Business Machines Corporation
Copyright (c) 1997-2006, International Business Machines Corporation and
Copyright (c) 2008-2016 International Business Machines Corporation and
Copyright (c) 2008-2016, International Business Machines Corporation and
Copyright (c) 1997-2016 International Business Machines Corporation and
Copyright (c) 2007-2011, International Business Machines
Copyright (c) 2007-2010, International Business Machines
Copyright (C) 2001-2016, International Business Machines Corporation and
Copyright (C) 2001-2003, International Business Machines Corporation and
Copyright (C) 2003-2011, International Business Machines
Copyright (c) 1997-2007, International Business Machines Corporation and
Copyright (c) 1997-2015, International Business Machines
Copyright (C) 2004-2009, International Business Machines Corporation and
Copyright (C) 2004, International Business Machines Corporation and
Copyright (C) 1996-2009, International Business Machines Corporation and
Copyright (C) 1996-2006, International Business Machines Corporation and
Copyright (C) 2011-2013, International Business Machines Corporation
Copyright (C) 2000-2007, International Business Machines
Copyright (c) 2001, International Business Machines Corporation and
Copyright (C) 2012-2013, International Business Machines
Copyright (c) 2010-2016, International Business Machines Corporation and
Copyright (c) 2010-2016, International Business Machines Corporation
Copyright (c) 1997-2010, International Business Machines Corporation
Copyright (c) 1997-2003, International Business Machines
Copyright (C) 2014-2015, International Business Machines Corporation and
Copyright (c) 1997-2013, International Business Machines Corporation
Copyright (c) 1999-2016, International Business Machines
Copyright (c) 1999-2016 International Business Machines Corporation and
Copyright (c) 2016, International Business Machines Corporation and
Copyright (c) 2016, International Business Machines
Copyright (c) 2013-2016, International Business Machines Corporation
Copyright (c) 2013, International Business Machines Corporation
Copyright (C) 2013-2016, International Business Machines Corporation and
Copyright (c) 2001-2010, International Business Machines Corporation and
Copyright (C) 2014, International Business Machines Corporation and
Copyright (c) 1999-2015, International Business Machines Corporation and
Copyright (C) 2001-2016, International Business Machines Corporation
Copyright (c) 2001-2008, International Business Machines Corporation and others
Copyright (C) 2003-2016, International Business Machines Corporation and
Copyright (c) 2004, International Business Machines Corporation
Copyright (C) 2001-2009, International Business Machines
Copyright (c) 2004,2011 International Business Machines
Copyright (c) 2004-2011, International Business Machines
Copyright (c) 2000-2016, International Business Machines Corporation
Copyright (c) 2001-2005, International Business Machines Corporation and
Copyright (C) 2001-2004, International Business Machines
Copyright (c) 2001-2009, International Business Machines
Copyright (c) 1997-2009, International Business Machines Corporation
Copyright (c) 1997-2013, International Business Machines
Copyright (c) 1997-2012, International Business Machines Corporation
Copyright (C) 2007-2015, International Business Machines Corporation and
Copyright (C) 2007-2011, International Business Machines Corporation and
Copyright (C) 2007, International Business Machines Corporation and
Copyright (c) 1998-2005, International Business Machines Corporation and
Copyright (c) 2002-2010, International Business Machines Corporation and
Copyright (C) 1999-2016 International Business Machines Corporation and
Copyright (c) 2004-2011, International Business Machines Corporation and
Copyright (c) 2002-2007, International Business Machines Corporation and
Copyright (C) 2003, International Business Machines Corporation and
Copyright (C) 2005-2011, International Business Machines
Copyright (C) 2011-2012, International Business Machines
Copyright (C) 2007-2012, International Business Machines
Copyright (C) 2006-2016, International Business Machines Corporation
Copyright (C) 2006-2012, International Business Machines Corporation and others.
Copyright 2007 Google Inc. All Rights Reserved.
Copyright (c) 2001-2015, International Business Machines
Copyright (C) 2006-2014, International Business Machines Corporation
Copyright (C) 2008, International Business Machines Corporation and
Copyright (C) 2009-2012, International Business Machines
Copyright (C) 2006 International Business Machines Corporation
Copyright (C) 2010-2016, International Business Machines Corporation and
Copyright (C) 2002-2014, International Business Machines Corporation and
Copyright (C) 2002-2005, International Business Machines Corporation and
Copyright (C) 2011, International Business Machines
Copyright (c) 2003-2010 International Business Machines
Copyright (C) 2003-2003, International Business Machines
Copyright (C) 1999-2016 International Business Machines Corporation
Copyright (C) 1999-2014 International Business Machines Corporation
Copyright (C) 1999-2014 International Business Machines
Copyright (C) 2002-2011, International Business Machines Corporation and others.
Copyright (C) 2002-2008, International Business Machines Corporation and others.
Copyright (C) 2002-2008 International Business Machines Corporation
Copyright (c) 2001-2005, International Business Machines
Copyright (C) 2002-2014 International Business Machines Corporation
Copyright (c) 2003-2011, International Business Machines
Copyright (C) 1998-2012, International Business Machines Corporation and
Copyright (C) 2001-2014, International Business Machines Corporation.
Copyright (C) 2001-2011, International Business Machines Corporation.
Copyright (C) 2001-2014, International Business Machines Corporation and
Copyright (C) 2001-2011, International Business Machines Corporation and
Copyright (C) 2001-2012, International Business Machines Corporation and
Copyright 2004 and onwards Google Inc.
Copyright (C) 2004-2014, International Business Machines
Copyright (C) 2006, International Business Machines
Copyright (C) 2004-2012, International Business Machines
Copyright (C) 2001-2013, International Business Machines
Copyright (C) 1998-2004, International Business Machines
Copyright (C) 2000-2013, International Business Machines
Copyright (C) 1999-2015 International Business Machines
Copyright (C) 2000-2006, International Business Machines
Copyright (C) 1999-2004, International Business Machines
Copyright (C) 2003-2007, International Business Machines
Copyright (C) 2002-2006, International Business Machines
Copyright (C) 2001-2015, International Business Machines
Copyright (c) 2001-2012, International Business Machines
Copyright (c) 2002-2004, International Business Machines
Copyright (C) 1999-2016, International Business Machines Corporation and
Copyright (c) 1996-2014, International Business Machines
Copyright (C) 1999-2016, International Business Machines Corporation
Copyright (C) 2009-2014 International Business Machines
Copyright (C) 2004-2007, International Business Machines
Copyright (c) 2001-2016, International Business Machines
Copyright (C) 2003-2009, International Business Machines
Copyright (C) 1999-2013, International Business Machines Corporation and
Copyright (C) 1999-2015, International Business Machines Corporation and
Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2001-2016 IBM, Inc. All Rights Reserved.
Copyright (C) 1999-2016 International Business Machines
Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved.
Copyright (C) 1998-2012, International Business Machines
Copyright (C) 1991 and later: Unicode, Inc. and others.
Copyright (C) 1997-2000, International Business Machines
Copyright (c) 1999-2007, International Business Machines Corporation and
Copyright (c) 2000 IBM, Inc. and Others.
Copyright (C) 2008-2013, International Business Machines
Copyright (C) 1998-2003, 2006, International Business Machines Corporation
Copyright (c) 2002-2003,International Business Machines
Copyright (C) 2009 International Business Machines
Copyright (C) 2010-2016 International Business Machines
Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved.
Copyright (C) 1998-2008, International Business Machines
Copyright (C) 2010-2016, International Business Machines
Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved.
Copyright (C) 2008-2009, International Business Machines Corporation and
Copyright (C) 2012,2014 International Business Machines
Copyright (c) 1996-2015, International Business Machines Corporation and
Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1999-2012, International Business Machines Corporation and
Copyright (C) 1996-2013, International Business Machines Corporation
Copyright (C) 1998-2005, International Business Machines
Copyright 2001 and onwards Google Inc.
Copyright (C) 2010-2012,2014, International Business Machines
Copyright (C) 1996-2015, International Business Machines Corporation and others.
Copyright (c) 2003-2004, International Business Machines
Copyright (C) 2000-2004, International Business Machines
Copyright (C) 2002-2013, International Business Machines
Copyright (C) 2002-2011 International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1999-2010, International Business Machines Corporation and others.
Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1996-2016, International Business Machines Corporation
Copyright (C) 1997-2010, International Business Machines
Software: opencv 4.2.0
Copyright notice:
Copyright (C) 2016, NVIDIA Corporation, all rights reserved.

@@ -0,0 +1,19 @@
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    message("icu4c third-party build does not support Windows currently.")
else()
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    add_definitions(-D ENABLE_ICU4C)
endif()

@@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
endif()
if (ENABLE_MINDDATA)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)

@@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
        DESTINATION ${INSTALL_LIB_DIR}
        COMPONENT mindspore
    )
    if (CMAKE_SYSTEM_NAME MATCHES "Windows")
        message("icu4c does not support the Windows system currently.")
    else()
        file(GLOB_RECURSE ICU4C_LIB_LIST
                ${icu4c_LIBPATH}/libicuuc*
                ${icu4c_LIBPATH}/libicudata*
                ${icu4c_LIBPATH}/libicui18n*
        )
        install(
            FILES ${ICU4C_LIB_LIST}
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
        )
    endif()
endif ()
if (ENABLE_CPU)

@@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
else()
    set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
                      mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
if (ENABLE_GPUQUE)
    target_link_libraries(_c_dataengine PRIVATE gpu_queue
                          ${CUDNN_PATH}/lib64/libcudnn.so

@@ -65,8 +65,21 @@
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/text/kernels/lookup_op.h"
#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "dataset/util/random.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
@@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB);
}
void bindTensorOps5(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
@@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
                  const std::string &>(),
         py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
         py::arg("separator"));
  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
    *m, "WordpieceTokenizerOp", "Tokenize a scalar token or 1-D tokens into subword tokens.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}

void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace.")
    .def(py::init<>());
  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case folding to a UTF-8 string tensor.")
    .def(py::init<>());
  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply Unicode normalization to a UTF-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp",
    "Replace parts of a UTF-8 string tensor that match the regular expression 'pattern' with 'replace'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));
  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by a regular expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"),
         py::arg("keep_delim_pattern"));
  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(
    *m, "BertTokenizerOp", "Tokenizer used for BERT text processing.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}

void bindSamplerOps(py::module *m) {
@@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
#ifdef ENABLE_ICU4C
(void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
#endif
(void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
.value("DE_INTER_LINEAR", InterpolationMode::kLinear)
.value("DE_INTER_CUBIC", InterpolationMode::kCubic)
@@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
  bindTensorOps2(&m);
  bindTensorOps3(&m);
  bindTensorOps4(&m);
  bindTokenizerOps(&m);
  bindSamplerOps(&m);
  bindDatasetOps(&m);
  bindInfoObjects(&m);
  bindVocabObjects(&m);
  bindGraphData(&m);
  bindDependIcuTokenizerOps(&m);
}
} // namespace dataset
} // namespace mindspore

@@ -1,8 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()

add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )

@@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;

// Characters that always delimit tokens: the four ASCII punctuation ranges,
// Unicode punctuation (\p{P}), and the CJK ideograph blocks (CJK Unified
// Ideographs, Extensions A-E, and the compatibility blocks), so CJK text is
// split per character.
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"
  "|[:-@]"
  "|[\\[-`]"
  "|[{-~]"
  "|[\\p{P}]"
  "|[\\x{4E00}-\\x{9FFF}]"
  "|[\\x{3400}-\\x{4DBF}]"
  "|[\\x{20000}-\\x{2A6DF}]"
  "|[\\x{2A700}-\\x{2B73F}]"
  "|[\\x{2B740}-\\x{2B81F}]"
  "|[\\x{2B820}-\\x{2CEAF}]"
  "|[\\x{F900}-\\x{FAFF}]"
  "|[\\x{2F800}-\\x{2FA1F}]";
// Special BERT tokens; the trailing '|' lets this be prepended to other patterns.
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalization_form_(normalization_form),
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    keep_delim_pattern = delim_pattern;
  } else {
    keep_delim_pattern = kCommonPattern;
  }
  if (preserve_unused_token_) {
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be a scalar string tensor");
  }
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    // to lower case
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
    cur_input = processed_tensor;
    // strip accent characters
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
  return regex_tokenizer_->Compute(processed_tensor, output);
}
} // namespace dataset
} // namespace mindspore
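
To make the pattern wiring above easier to follow, a hedged restatement of what each flag combination produces (comments only; this mirrors the constructor logic and adds no behavior):

// Illustrative only -- how the constructor flags shape the two patterns
// handed to RegexTokenizerOp:
//
//   keep_whitespace = true:
//     delim_pattern      = "\\s+|" + kCommonPattern  // split on whitespace too
//     keep_delim_pattern = delim_pattern             // whitespace survives as tokens
//
//   keep_whitespace = false:
//     keep_delim_pattern = kCommonPattern            // punctuation/CJK kept, whitespace dropped
//
//   preserve_unused_token = true (the default) additionally prepends kUnusedPattern
//   to both, so "[CLS]", "[SEP]", "[UNK]", "[PAD]" and "[MASK]" are emitted intact.
BasicTokenizerOp tokenizer(/*lower_case=*/false, /*keep_whitespace=*/true);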

@@ -0,0 +1,64 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class BasicTokenizerOp : public TensorOp {
 public:
  static const bool kDefLowerCase;
  static const bool kDefKeepWhitespace;
  static const NormalizeForm kDefNormalizationForm;
  static const bool kDefPreserveUnusedToken;

  BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
                   NormalizeForm normalization_form = kDefNormalizationForm,
                   bool preserve_unused_token = kDefPreserveUnusedToken);

  ~BasicTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }

  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  static const char kCommonPattern[];
  static const char kUnusedPattern[];
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalization_form_;
  bool preserve_unused_token_;
  std::unique_ptr<CaseFoldOp> case_fold_;
  std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
  std::unique_ptr<NormalizeUTF8Op> common_normalize_;
  std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
  std::unique_ptr<RegexReplaceOp> replace_control_chars_;
  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_

@@ -0,0 +1,27 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  std::shared_ptr<Tensor> basic_tensor;
  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
  RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore
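
A hedged worked example of this two-stage Compute (the input string, the vocabulary, and the "##" suffix indicator are illustrative assumptions, not values taken from the commit):

// input (scalar string)       : "unaffable day"
// after basic_tokenizer_      : ["unaffable", "day"]
// after wordpiece_tokenizer_, assuming a vocab of {"un", "##aff", "##able", "day"}
// and "##" as the suffix indicator: ["un", "##aff", "##able", "day"]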

@@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class BertTokenizerOp : public TensorOp {
 public:
  BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
                  const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                  const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                  const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}

  ~BertTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }

  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  WordpieceTokenizerOp wordpiece_tokenizer_;
  BasicTokenizerOp basic_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_

View File

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/case_fold_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
icu::ErrorCode error;
const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed.");
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
icu::StringByteSink<std::string> sink(&strs[i++]);
nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
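
CaseFoldOp delegates entirely to ICU's NFKC_Casefold transform, so lower-casing and compatibility normalization happen in a single pass over each tensor element. A minimal sketch of the Python-side op added later in this commit (the corpus path is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

case_fold = nlp.CaseFold()
data = ds.TextFileDataset("corpus.txt", shuffle=False)  # hypothetical input file
data = data.map(operations=case_fold)
# Per the C++ unit test below, "Welcome to China. \n 中国\t北京" folds to
# "welcome to china. \n 中国\t北京".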

View File

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class CaseFoldOp : public TensorOp {
public:
CaseFoldOp() {}
~CaseFoldOp() override = default;
void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_

View File

@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {

View File

@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
: vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(vocab_);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
std::vector<WordIdType> word_ids;

View File

@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector<int32_t> &ngrams, int32_t l_len, int32_t r_le
separator_(separator) {}
Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor");
std::vector<int32_t> offsets; // offsets for each str
std::vector<std::string> res; // holds the result of ngrams

View File

@ -0,0 +1,75 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/normalize_utf8_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
icu::ErrorCode error;
const icu::Normalizer2 *normalize = nullptr;
switch (normalize_form_) {
case NormalizeForm::kNone: {
*output = input;
return Status::OK();
}
case NormalizeForm::kNfc: {
normalize = icu::Normalizer2::getNFCInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
break;
}
case NormalizeForm::kNfkc: {
normalize = icu::Normalizer2::getNFKCInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
break;
}
case NormalizeForm::kNfd: {
normalize = icu::Normalizer2::getNFDInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
break;
}
case NormalizeForm::kNfkd: {
normalize = icu::Normalizer2::getNFKDInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
break;
}
default: {
RETURN_STATUS_UNEXPECTED("unexpected normalize form");
break;
}
}
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
icu::StringByteSink<std::string> sink(&strs[i++]);
normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
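
kNone is deliberately a pass-through that returns the input tensor unchanged; the four ICU-backed forms differ in whether they compose (NFC/NFKC) or decompose (NFD/NFKD) and in whether compatibility mappings are applied (the K forms). A hedged sketch of the visible difference through the Python op added later in this commit:

import mindspore.dataset as ds
import mindspore.dataset.text as nlp
from mindspore.dataset.text.utils import NormalizeForm

nfkc = nlp.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)
data = ds.TextFileDataset("normalize.txt", shuffle=False)  # test data file added below
data = data.map(operations=nfkc)
# NFKC rewrites compatibility characters, e.g. the superscript in "2⁵" becomes "25";
# NFC/NFD only recompose or decompose combining marks and leave "2⁵" as is.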

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
enum class NormalizeForm {
kNone = 0,
kNfc,
kNfkc,
kNfd,
kNfkd,
};
class NormalizeUTF8Op : public TensorOp {
public:
static const NormalizeForm kDefNormalizeForm;
explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
~NormalizeUTF8Op() override = default;
void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
private:
NormalizeForm normalize_form_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_

View File

@ -0,0 +1,57 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_replace_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
std::string *out) const {
CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null");
UErrorCode icu_error = U_ZERO_ERROR;
icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
matcher->reset(unicode_text);
icu::UnicodeString unicode_out;
if (replace_all_) {
unicode_out = matcher->replaceAll(replace_, icu_error);
} else {
unicode_out = matcher->replaceFirst(replace_, icu_error);
}
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");
unicode_out.toUTF8String(*out);
return Status::OK();
}
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
UErrorCode icu_error = U_ZERO_ERROR;
icu::RegexMatcher matcher(pattern_, 0, icu_error);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern");
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i++]));
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
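
Note that a single RegexMatcher is compiled once per Compute call and reset for every element, so the per-row cost is only the match itself. A minimal Python sketch matching the C++ unit test below:

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

replace_op = nlp.RegexReplace(pattern="\\s+", replace="_", replace_all=True)
data = ds.TextFileDataset("regex_replace.txt", shuffle=False)  # test data file added below
data = data.map(operations=replace_op)
# "Welcome to China. \n 中国\t北京" -> "Welcome_to_China._中国_北京"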

View File

@ -0,0 +1,55 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#include <memory>
#include <string>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class RegexReplaceOp : public TensorOp {
public:
RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
: pattern_(icu::UnicodeString::fromUTF8(pattern)),
replace_(icu::UnicodeString::fromUTF8(replace)),
replace_all_(replace_all) {}
~RegexReplaceOp() override = default;
void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;
private:
const icu::UnicodeString pattern_;
const icu::UnicodeString replace_;
const bool replace_all_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_

View File

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode) const {
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
int total_len = input.length();
int end = start + len;
CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range");
icu::UnicodeString temp;
input.extract(start, len, temp);
if (out_utf8 != nullptr) {
temp.toUTF8String(*out_utf8);
}
if (out_unicode != nullptr) {
*out_unicode = temp;
}
return Status::OK();
}
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
UErrorCode status = U_ZERO_ERROR;
out_tokens->clear();
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
token_matcher.reset(utext);
int token_start_index = 0;
status = U_ZERO_ERROR;
while (token_matcher.find(status) && U_SUCCESS(status)) {
int deli_start_index = token_matcher.start(status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
int deli_end_index = token_matcher.end(status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");
// Add non-empty token
int token_len = deli_start_index - token_start_index;
if (token_len > 0) {
std::string token;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
out_tokens->emplace_back(std::move(token));
}
int delim_len = deli_end_index - deli_start_index;
if (keep_delim_ && delim_len > 0) {
icu::UnicodeString delim_str;
std::string delim_utf8_str;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
delim_matcher.reset(delim_str);
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
out_tokens->emplace_back(std::move(delim_utf8_str));
}
}
token_start_index = deli_end_index;
}
if (token_start_index < utext.length()) {
std::string temp;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
out_tokens->emplace_back(std::move(temp));
}
return Status::OK();
}
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view text;
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
std::vector<std::string> tokens;
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
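
GetRegexTokens walks the delimiter matches left to right: text between matches becomes a token, and a matched delimiter is re-emitted as a token only when keep_delim_ is set and the delimiter also fully matches keep_delim_pattern_. A hedged Python sketch (the first pattern is the one used in the C++ unit test below):

import mindspore.dataset.text as nlp

# Split on control/format characters and whitespace runs; emit no delimiter tokens.
tokenizer = nlp.RegexTokenizer(delim_pattern="\\p{Cc}|\\p{Cf}|\\s+", keep_delim_pattern="")
# Keeping delimiters instead: any delimiter that also matches '\\s+' is emitted as a token.
tokenizer_keep = nlp.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="\\s+")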

View File

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class RegexTokenizerOp : public TensorOp {
public:
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
keep_delim_(!keep_delim_pattern.empty()) {}
~RegexTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode = nullptr) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
private:
const icu::UnicodeString delim_pattern_;
const icu::UnicodeString keep_delim_pattern_;
const bool keep_delim_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_

View File

@ -28,6 +28,7 @@ namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp {
} // namespace dataset
} // namespace mindspore
#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_

View File

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
UScriptCode last_script = USCRIPT_INVALID_CODE;
icu::ErrorCode status;
int start = 0;
int len = 0;
std::vector<std::string> splits;
bool was_space = false;
for (size_t i = 0; i < runes.size(); i++) {
bool is_space = u_isUWhiteSpace(runes[i].rune);
UScriptCode script = uscript_getScript(runes[i].rune, status);
if (status.isFailure()) {
status.reset();
script = USCRIPT_INVALID_CODE;
}
// 1) Separate UTF-8 strings of different UScriptCode values
// (such as: "Chinese中国" should be split into ["Chinese", "中国"])
// 2) Separate whitespace and non-whitespace UTF-8 strings
// (such as: " ." should be split into [" ", "."])
if (len > 0 && (script != last_script || is_space != was_space)) {
// 3) If keep_whitespace_ is false, all whitespace characters will be discarded
if (keep_whitespace_ || !was_space) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
start = runes[i].offset;
len = runes[i].len;
} else {
len += runes[i].len;
}
last_script = script;
was_space = is_space;
}
if (len > 0 && (keep_whitespace_ || !was_space)) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
// 4) If the input is empty scalar string, the output will be 1-D empty string.
if (splits.empty()) {
splits.emplace_back("");
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
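
A token boundary is therefore emitted whenever the UScriptCode changes or the run flips between whitespace and non-whitespace; with keep_whitespace_ unset, the whitespace runs are simply dropped. Python-side sketch, with the expected outputs taken from the C++ unit test below:

import mindspore.dataset.text as nlp

tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
# "Welcome to China. \n 中国\t北京" -> ['Welcome', 'to', 'China', '.', '中国', '北京']
tokenizer_ws = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
# Same input keeps the runs: ['Welcome', ' ', 'to', ' ', 'China', '.', ' \n ', '中国', '\t', '北京']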

View File

@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class UnicodeScriptTokenizerOp : public TensorOp {
public:
static const bool kDefKeepWhitespace;
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
~UnicodeScriptTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
private:
bool keep_whitespace_;  // Whether or not to keep whitespace tokens
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_

View File

@ -0,0 +1,73 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::vector<std::string> splits;
int start = 0;
int len = 0;
for (size_t i = 0; i < runes.size(); i++) {
if (u_isUWhiteSpace(runes[i].rune)) {
if (len > 0) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
len = 0;
}
} else {
if (len == 0) {
start = runes[i].offset;
}
len += runes[i].len;
}
}
if (len > 0) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
if (splits.empty()) {
splits.emplace_back("");
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
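
This is the degenerate case of the script tokenizer: split only at ICU whitespace and never emit the whitespace itself. Python-side sketch, with outputs per the C++ unit test below:

import mindspore.dataset.text as nlp

tokenizer = nlp.WhitespaceTokenizer()
# "Welcome to China." -> ['Welcome', 'to', 'China.']
# An empty or all-whitespace scalar yields a 1-D tensor holding one empty string.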

View File

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class WhitespaceTokenizerOp : public TensorOp {
public:
WhitespaceTokenizerOp() {}
~WhitespaceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_

View File

@ -0,0 +1,138 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include <algorithm>
#include <utility>
namespace mindspore {
namespace dataset {
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
const int &max_bytes_per_token, const std::string &unknown_token)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token) {}
void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const {
int rows = tokens.size();
int max_cols = 0;
for (int i = 0; i < rows; i++) {
max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
}
out_padded_tokens->resize(rows * max_cols, padded_str);
for (int i = 0; i < rows; i++) {
int index = i * max_cols;
for (int j = 0; j < tokens[i].size(); j++) {
(*out_padded_tokens)[index++] = tokens[i][j];
}
}
*out_cols = max_cols;
}
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
bool *out_found, int *out_end) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
*out_found = false;
for (int i = runes.size() - 1; i >= 0; i--) {
*out_end = runes[i].offset + runes[i].len;
int len = *out_end - start;
std::string word = input_token.substr(start, len);
if (start > 0) {
word = suffix_indicator_ + word;
}
WordIdType default_id = -1;
if (vocab_->Lookup(word, default_id) != default_id) {
*out_found = true;
break;
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
out_tokens->clear();
if (unknown_token_.empty()) {
out_tokens->emplace_back(input_token);
} else {
out_tokens->emplace_back(unknown_token_);
}
return Status::OK();
}
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
std::vector<std::string> *out_tokens) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
std::string subword = input_token.substr(start, end - start);
if (start > 0) {
subword = suffix_indicator_ + subword;
}
out_tokens->emplace_back(subword);
return Status::OK();
}
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
if (input_token.size() > max_bytes_per_token_) {
return FoundNoToken(input_token, out_tokens);
}
RuneStrArray runes;
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
int end;
for (int start = 0; start < input_token.size();) {
bool found;
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
if (found) {
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
start = end;
} else {
return FoundNoToken(input_token, out_tokens);
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
}
std::vector<std::vector<std::string>> out_tokens(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
}
std::vector<std::string> padded_tokens;
int cols = 0;
PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
std::vector<dsize_t> shapes;
if (input->Rank() == 1) {
shapes.push_back(out_tokens.size());
}
shapes.push_back(cols);
*output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
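
LookupWord scans candidate end positions from the longest remaining substring down to a single rune, so GetTokens implements the usual greedy longest-match-first WordPiece segmentation; every subword that does not start the token gets suffix_indicator_ prepended before the vocab lookup, and PadTokens right-pads ragged rows with '<pad>' so a 1-D string input comes back as a rectangular 2-D tensor. A small Python sketch (toy vocab, not from this commit):

import mindspore.dataset.text as nlp

vocab = nlp.Vocab.from_list(["work", "##ing", "hour", "##s", "[UNK]"])
tokenizer = nlp.WordpieceTokenizer(vocab=vocab)
# 'working' -> ['work', '##ing']; 'hours' -> ['hour', '##s'].
# A token with no vocab match maps to unknown_token ('[UNK]'); with unknown_token=''
# the original token is passed through unchanged.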

View File

@ -0,0 +1,68 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <string_view>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/vocab.h"
#include "dataset/util/status.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
class WordpieceTokenizerOp : public TensorOp {
public:
static const char kDefSuffixIndicator[];
static const int kDefMaxBytesPerToken;
static const char kDefUnknownToken[];
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
const int &max_bytes_per_token = kDefMaxBytesPerToken,
const std::string &unknown_token = kDefUnknownToken);
~WordpieceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const;
Status AddSubword(const std::string &input_token, const int start, const int end,
std::vector<std::string> *out_tokens) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
int *out_end) const;
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
private:
const std::shared_ptr<Vocab> vocab_;
const std::string suffix_indicator_;
const int max_bytes_per_token_;
const std::string unknown_token_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_

View File

@ -15,5 +15,18 @@
"""
mindspore.dataset.text
"""
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram
from .utils import to_str, to_bytes, JiebaMode, Vocab
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer"
]
if platform.system().lower() != 'windows':
from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \
RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer
__all__ += ["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8",
"RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"]

View File

@ -17,10 +17,11 @@ c transforms for all text related operators
import os
import re
import platform
import mindspore._c_dataengine as cde
from .utils import JiebaMode
from .utils import JiebaMode, NormalizeForm
from .validators import check_lookup, check_jieba_add_dict, \
check_jieba_add_word, check_jieba_init, check_ngram
@ -174,3 +175,172 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
"""
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize scalar token or 1-D tokens to subword tokens.
Args
vocab(Vocab): a Vocab object.
suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token'(default '[UNK]').
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
self.vocab = vocab
self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token)
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace characters (such as ' ', '\t', '\r', '\n').
"""
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
Args:
keep_whitespace(bool, optional): Whether or not to emit whitespace tokens (default False).
"""
def __init__(self, keep_whitespace=False):
self.keep_whitespace = keep_whitespace
super().__init__(self.keep_whitespace)
class CaseFold(cde.CaseFoldOp):
"""
Apply case fold operation on UTF-8 string tensor.
"""
DE_C_INTER_NORMALIZE_FORM = {
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
}
class NormalizeUTF8(cde.NormalizeUTF8Op):
"""
Apply normalize operation on UTF-8 string tensor.
Args:
normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
If set to "NONE", nothing is done to the input string tensor.
If set to any of "NFC", "NFKC", "NFD", "NFKD", the corresponding normalization is applied (default "NFKC").
See http://unicode.org/reports/tr15/ for details.
"""
def __init__(self, normalize_form=NormalizeForm.NFKC):
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
super().__init__(self.normalize_form)
class RegexReplace(cde.RegexReplaceOp):
"""
Replace parts of a UTF-8 string tensor with 'replace' according to the regular expression 'pattern'.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
pattern(string): the regex expression pattern.
replace(string): the string used to replace matched elements.
replace_all(bool, optional): If False, only replace the first matched element;
if True, replace all matched elements (default True).
"""
def __init__(self, pattern, replace, replace_all=True):
self.pattern = pattern
self.replace = replace
self.replace_all = replace_all
super().__init__(self.pattern, self.replace, self.replace_all)
class RegexTokenizer(cde.RegexTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
delim_pattern(string): The pattern of regex delimiters.
The original string will be split at matched delimiters.
keep_delim_pattern(string, optional): A delimiter matched by 'delim_pattern' is kept as a token
if it also matches 'keep_delim_pattern'. The default value is an empty string (''),
in which case delimiters are not kept as output tokens.
"""
def __init__(self, delim_pattern, keep_delim_pattern=''):
self.delim_pattern = delim_pattern
self.keep_delim_pattern = keep_delim_pattern
super().__init__(self.delim_pattern, self.keep_delim_pattern)
class BasicTokenizer(cde.BasicTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by specific rules.
Args:
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
to the input text to fold it to lower case and strip accent characters; if False, only apply
the NormalizeUTF8 operation in 'normalization_form' mode to the input text (default False).
keep_whitespace(bool, optional): If True, whitespace is kept in the output tokens (default False).
normalization_form(Enum, optional): Used to specify the normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True).
"""
def __init__(self, lower_case=False, keep_whitespace=False,
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
self.lower_case = lower_case
self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
super().__init__(self.lower_case, self.keep_whitespace,
self.normalization_form, self.preserve_unused_token)
class BertTokenizer(cde.BertTokenizerOp):
"""
Tokenizer used for Bert text processing.
Args:
vocab(Vocab): a Vocab object.
suffix_indicator(string, optional): Prepended to mark a subword that continues a word rather than
starting it (default '##').
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split (default 100).
unknown_token(string, optional): When a token cannot be found in the vocab: if 'unknown_token' is an empty
string, the token is returned directly; otherwise 'unknown_token' is returned (default '[UNK]').
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
to the input text to fold it to lower case and strip accent characters; if False, only apply
the NormalizeUTF8 operation in 'normalization_form' mode to the input text (default False).
keep_whitespace(bool, optional): If True, whitespace is kept in the output tokens (default False).
normalization_form(Enum, optional): Used to specify the normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True).
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
self.vocab = vocab
self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
self.lower_case = lower_case
self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token)
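
For reference, when lower_case=True the underlying BasicTokenizerOp is roughly the composition of the standalone ops defined above; an equivalent explicit pipeline can be sketched as below. The accent-stripping pattern is illustrative only (the real op keeps its patterns internal, and it additionally splits CJK characters and punctuation):

import mindspore.dataset as ds
import mindspore.dataset.text as nlp
from mindspore.dataset.text.utils import NormalizeForm

ops = [nlp.CaseFold(),
       nlp.NormalizeUTF8(NormalizeForm.NFD),
       nlp.RegexReplace(pattern="\\p{Mn}", replace="", replace_all=True),  # strip combining marks
       nlp.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="")]
data = ds.TextFileDataset("basic_tokenizer.txt", shuffle=False)
for op in ops:
    data = data.map(operations=op)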

View File

@ -127,3 +127,11 @@ class JiebaMode(IntEnum):
MIX = 0
MP = 1
HMM = 2
class NormalizeForm(IntEnum):
NONE = 0
NFC = 1
NFKC = 2
NFD = 3
NFKD = 4

View File

@ -18,7 +18,14 @@
#include <string_view>
#include "common/common.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
@ -105,3 +112,229 @@ TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp());
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
std::shared_ptr<Tensor> output;
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 3);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "China.");
input = std::make_shared<Tensor>(" hello");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>("hello");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>("hello ");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>(" ");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true));
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 10);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, " ");
CheckEqual(output, {2}, "to");
CheckEqual(output, {3}, " ");
CheckEqual(output, {4}, "China");
CheckEqual(output, {5}, ".");
CheckEqual(output, {6}, " \n ");
CheckEqual(output, {7}, "中国");
CheckEqual(output, {8}, "\t");
CheckEqual(output, {9}, "北京");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 6);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "China");
CheckEqual(output, {3}, ".");
CheckEqual(output, {4}, "中国");
CheckEqual(output, {5}, "北京");
input = std::make_shared<Tensor>(" Welcome to 中国. ");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 4);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "中国");
CheckEqual(output, {3}, ".");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 8);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
CheckEqual(output, {0}, " ");
CheckEqual(output, {1}, "Welcome");
CheckEqual(output, {2}, " ");
CheckEqual(output, {3}, "to");
CheckEqual(output, {4}, " ");
CheckEqual(output, {5}, "中国");
CheckEqual(output, {6}, ".");
CheckEqual(output, {7}, " ");
input = std::make_shared<Tensor>("Hello");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
CheckEqual(output, {0}, "Hello");
input = std::make_shared<Tensor>("H");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
CheckEqual(output, {0}, "H");
input = std::make_shared<Tensor>("");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor7: " << output->ToString();
CheckEqual(output, {0}, "");
input = std::make_shared<Tensor>("Hello中国Hello世界");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 4);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor8: " << output->ToString();
CheckEqual(output, {0}, "Hello");
CheckEqual(output, {1}, "中国");
CheckEqual(output, {2}, "Hello");
CheckEqual(output, {3}, "世界");
input = std::make_shared<Tensor>(" ");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor10: " << output->ToString();
CheckEqual(output, {0}, " ");
input = std::make_shared<Tensor>(" ");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor11: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
MS_LOG(INFO) << "Doing TestCaseFold.";
std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp());
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = case_fold_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 0);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {}, "welcome to china. \n 中国\t北京");
}
TEST_F(MindDataTestTokenizerOp, TestNormalize) {
MS_LOG(INFO) << "Doing TestNormalize.";
std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc));
std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc));
std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd));
std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("ṩ");
std::shared_ptr<Tensor> output;
Status s = nfc_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFC str:" << output->ToString();
s = nfkc_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFKC str:" << output->ToString();
s = nfd_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFD str:" << output->ToString();
s = nfkd_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFKD str:" << output->ToString();
}
TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
MS_LOG(INFO) << "Doing TestRegexReplace.";
std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = regex_replace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 0);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {}, "Welcome_to_China._中国_北京");
}
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", ""));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = regex_tokenizer_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
}
TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
MS_LOG(INFO) << "Doing TestBasicTokenizer.";
// bool lower_case, bool keep_whitespace,
// NormalizeForm normalization_form, bool preserve_unused_token
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = basic_tokenizer->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
}

View File

@ -0,0 +1,7 @@
Welcome to Beijing北京欢迎您
長風破浪會有時,直掛雲帆濟滄海
😀嘿嘿😃哈哈😄大笑😁嘻嘻
明朝(1368—1644年)和清朝(1644—1911年)是中国封建王朝史上最后两个朝代
明代(1368-1644)と清代(1644-1911)は、中国の封建王朝の歴史における最後の2つの王朝でした
명나라 (1368-1644)와 청나라 (1644-1911)는 중국 봉건 왕조의 역사에서 마지막 두 왕조였다
Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ

View File

@ -0,0 +1,14 @@
床前明月光
疑是地上霜
举头望明月
低头思故乡
I am making small mistakes during working hours
😀嘿嘿😃哈哈😄大笑😁嘻嘻
繁體字
unused [CLS]
unused [SEP]
unused [UNK]
unused [PAD]
unused [MASK]
12+/-28=40/-16
Hello World!

View File

@ -0,0 +1,6 @@
ṩ
ḍ̇
q̣̇
ﬁ
2⁵
ẛ̣

View File

@ -0,0 +1,8 @@
Hello World
Let's Go
1:hello
2:world
31:beijing
Welcome to China!
我 不想 长大
Welcome to Shenzhen!

View File

@ -0,0 +1,3 @@
Welcome to Shenzhen!
北京欢迎您!Welcome to Beijing!
12¥+36¥=?

View File

@ -0,0 +1,25 @@
my
favorite
book
is
love
during
the
cholera
era
what

View File

@ -0,0 +1,83 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BasicTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
test_paras = [
dict(
first=1,
last=6,
expected_tokens=
[['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
'(', '1644', '—', '1911', '年', ')', '是', '中', '国', '封', '建',
'王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']]
),
dict(
first=7,
last=7,
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
lower_case=True
),
]
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(text, expected_tokens[count])
count = count + 1
def test_basic_tokenizer():
"""
Test BasicTokenizer
"""
for paras in test_paras:
check_basic_tokenizer(**paras)
if __name__ == '__main__':
test_basic_tokenizer()

View File

@ -0,0 +1,183 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BertTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
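# Vocabulary shared by every case below; it deliberately mixes Chinese characters,
# English wordpieces ('##' marks a suffix piece), emoji, digits, and the BERT
# special tokens, so each fixture line is either fully covered or unknown on purpose.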
vocab_bert = [
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "",
"i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
"😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I",
"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"
]
pad = '<pad>'
test_paras = [
# test chinese text
dict(
first=1,
last=4,
expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
[['疑'], ['是'], ['地'], ['上'], ['霜']],
[['举'], ['头'], ['望'], ['明'], ['月']],
[['低'], ['头'], ['思'], ['故'], ['乡']]],
vocab_list=vocab_bert
),
# test english text
dict(
first=5,
last=5,
expect_str=[[['i', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
lower_case=True,
vocab_list=vocab_bert
),
dict(
first=5,
last=5,
expect_str=[[['I', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
lower_case=False,
vocab_list=vocab_bert
),
# test emoji tokens
dict(
first=6,
last=7,
expect_str=[
[['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
[['繁'], ['體'], ['字']]],
normalization_form=nlp.utils.NormalizeForm.NFKC,
vocab_list=vocab_bert
),
# test preserved tokens
dict(
first=8,
last=12,
expect_str=[
[['[UNK]'], ['[CLS]']],
[['[UNK]'], ['[SEP]']],
[['[UNK]'], ['[UNK]']],
[['[UNK]'], ['[PAD]']],
[['[UNK]'], ['[MASK]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
),
# test special symbol
dict(
first=13,
last=13,
expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
preserve_unused_token=True,
vocab_list=vocab_bert
),
# test non-default params
dict(
first=8,
last=8,
expect_str=[
[['[UNK]'], [' '], ['[CLS]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
keep_whitespace=True
),
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['[CLS]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
keep_whitespace=True,
unknown_token=''
),
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['['], ['CLS'], [']']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=False,
keep_whitespace=True,
unknown_token=''
),
]
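# BertTokenizer is BasicTokenizer (case folding, normalization, CJK/punctuation
# splitting) followed by WordpieceTokenizer over the resulting words.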
def check_bert_tokenizer(first, last, expect_str,
vocab_list,
suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE,
preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list)
tokenizer_op = nlp.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator,
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
lower_case=lower_case, keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count])
count = count + 1
def test_bert_tokenizer():
"""
Test BertTokenizer
"""
for paras in test_paras:
check_bert_tokenizer(**paras)
if __name__ == '__main__':
test_bert_tokenizer()


@ -15,11 +15,15 @@
"""
Testing UnicodeCharTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
def split_by_unicode_char(input_strs):
@ -48,5 +52,182 @@ def test_unicode_char_tokenizer():
assert split_by_unicode_char(input_strs) == tokens
def test_whitespace_tokenizer():
"""
Test WhitespaceTokenizer
"""
whitespace_strs = [["Welcome", "to", "Beijing!"],
["北京欢迎您!"],
["我喜欢English!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.WhitespaceTokenizer()
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is : {}".format(tokens))
assert whitespace_strs == tokens
def test_unicode_script_tokenizer():
"""
Test UnicodeScriptTokenizer with keep_whitespace=False
"""
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
["北京欢迎您", ""],
["我喜欢", "English", "!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is : {}".format(tokens))
assert unicode_script_strs == tokens
def test_unicode_script_tokenizer2():
"""
Test UnicodeScriptTokenizer with keep_whitespace=True
"""
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
["北京欢迎您", ""],
["我喜欢", "English", "!"],
[" "]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is :", tokens)
assert unicode_script_strs2 == tokens
def test_case_fold():
"""
Test CaseFold
"""
expect_strs = ["welcome to beijing!", "北京欢迎您！", "我喜欢english!", " "]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
op = nlp.CaseFold()
dataset = dataset.map(operations=op)
lower_strs = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
lower_strs.append(text)
assert lower_strs == expect_strs
def test_normalize_utf8():
"""
Test NormalizeUTF8
"""
def normalize(normalize_form):
dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
dataset = dataset.map(operations=normalize_op)
out_bytes = []
out_texts = []
for i in dataset.create_dict_iterator():
out_bytes.append(i['text'])
out_texts.append(nlp.to_str(i['text']).tolist())
logger.info("The out bytes is : ", out_bytes)
logger.info("The out texts is: ", out_texts)
return out_bytes
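# Expected bytes per Unicode normalization form. For intuition, the stdlib agrees
# with these fixtures: unicodedata.normalize('NFKC', 'ﬁ') == 'fi' and
# unicodedata.normalize('NFKC', '2⁵') == '25', while NFC keeps both compatibility
# characters intact and NFD/NFKD decompose into base letters plus combining marks.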
expect_normalize_data = [
# NFC
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
# NFKC
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'fi', b'25', b'\xe1\xb9\xa9'],
# NFD
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
# NFKD
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'fi', b'25', b's\xcc\xa3\xcc\x87']
]
assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normalize_data[0]
assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normalize_data[2]
assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normalize_data[3]
def test_regex_replace():
"""
Test RegexReplace
"""
def regex_replace(first, last, expect_str, pattern, replace):
dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
replace_op = nlp.RegexReplace(pattern, replace)
dataset = dataset.map(operations=replace_op)
out_text = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
out_text.append(text)
logger.info("Out:", out_text)
logger.info("Exp:", expect_str)
assert expect_str == out_text
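# Patterns exercised below: \p{Ll} matches lowercase letters; ^(\d:|b:) matches a
# leading 'digit:' or 'b:' prefix; \s+ matches runs of whitespace; \p{Cc}/\p{Cf}
# match control and format characters.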
regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
def test_regex_tokenizer():
"""
Test RegexTokenizer
"""
def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
dataset = dataset.map(operations=tokenizer_op)
out_text = []
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
np.testing.assert_array_equal(text, expect_str[count])
count += 1
out_text.append(text)
logger.info("Out:", out_text)
logger.info("Exp:", expect_str)
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
if __name__ == '__main__':
test_unicode_char_tokenizer()
test_whitespace_tokenizer()
test_unicode_script_tokenizer()
test_unicode_script_tokenizer2()
test_case_fold()
test_normalize_utf8()
test_regex_replace()
test_regex_tokenizer()


@ -0,0 +1,113 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing WordpieceTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
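# WordpieceTokenizer greedily matches the longest vocabulary prefix, then continues
# with '##'-prefixed suffix pieces, e.g. 'favorite' -> ['favor', '##ite']; a word
# with no full segmentation becomes unknown_token.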
vocab_english = [
"book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
]
vocab_chinese = [
"", '', '', '', '', '', '', '', '', '', '', '', ''
]
vocab_mix = vocab_chinese + vocab_english
test_paras = [
dict(
first=1,
last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['[UNK]']],
vocab_list=vocab_english
),
dict(
first=1,
last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['what']],
vocab_list=vocab_english,
unknown_token=""
),
dict(
first=1,
last=10,
expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
vocab_list=vocab_english,
max_bytes_per_token=4
),
dict(
first=11,
last=25,
expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']],
vocab_list=vocab_chinese,
),
dict(
first=25,
last=25,
expect_str=[['您']],
vocab_list=vocab_chinese,
unknown_token=""
),
dict(
first=1,
last=25,
expect_str=[
['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
['[UNK]'],
['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']],
vocab_list=vocab_mix,
),
]
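# Helper mirrors the other tokenizer tests: slice lines [first, last] from the
# fixture, run WordpieceTokenizer with the given vocab, and compare row by row.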
def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list)
tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
max_bytes_per_token=max_bytes_per_token)
dataset = dataset.map(operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count])
count = count + 1
def test_wordpiece_tokenizer():
"""
Test WordpieceTokenizer
"""
for paras in test_paras:
check_wordpiece_tokenizer(**paras)
if __name__ == '__main__':
test_wordpiece_tokenizer()

third_party/icu4c/filter.json

@ -0,0 +1,6 @@
{
"strategy": "additive",
"featureFilters": {
"normalization": "include"
}
}