Add WhitespaceTokenizer and UnicodeScriptTokenizer for NLP

Add CaseFold and NormalizeUTF8

Add RegexReplace

Add RegexTokenizer

Add BasicTokenizer

Add WordpieceTokenizer

Add BertTokenizer
qianlong 2020-05-05 16:51:05 +08:00
parent ea37dc76f0
commit 4f16f036be
45 changed files with 2944 additions and 10 deletions
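The ops introduced here compose: as the diff below shows, BertTokenizerOp simply runs BasicTokenizerOp and then WordpieceTokenizerOp. A minimal sketch of that chaining, using only classes and kDef* constants that appear in this commit (the helper name TokenizeForBert and the pre-built vocab/input tensors are illustrative assumptions, not part of the change):

#include <memory>

#include "dataset/core/tensor.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/util/status.h"

namespace mindspore {
namespace dataset {
// Illustrative helper (not in this commit): the two stages that
// BertTokenizerOp::Compute chains internally.
Status TokenizeForBert(const std::shared_ptr<Vocab> &vocab, const std::shared_ptr<Tensor> &input,
                       std::shared_ptr<Tensor> *output) {
  // Stage 1: case fold, normalize, strip accents/controls, then split on
  // whitespace, punctuation and CJK characters.
  BasicTokenizerOp basic(/*lower_case=*/true);
  std::shared_ptr<Tensor> words;
  RETURN_IF_NOT_OK(basic.Compute(input, &words));
  // Stage 2: greedy vocabulary lookup into subword pieces.
  WordpieceTokenizerOp wordpiece(vocab, WordpieceTokenizerOp::kDefSuffixIndicator,
                                 WordpieceTokenizerOp::kDefMaxBytesPerToken,
                                 WordpieceTokenizerOp::kDefUnknownToken);
  return wordpiece.Compute(words, output);
}
}  // namespace dataset
}  // namespace mindspore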

@@ -3057,6 +3057,587 @@ Software: tinyxml2 8.0.0
Copyright 2011, John Resig.
Copyright 2011, The Dojo Foundation.
Software: icu 67.1
Copyright (C) 2000-2004, International Business Machines Corporation
Copyright (C) 2002-2014, International Business Machines
(C) Copyright IBM Corp. 1998-2011 - All Rights Reserved
Copyright (C) 2003-2008, International Business Machines
Copyright (C) 2005-2006, International Business Machines
Copyright (C) 2016 and later: Unicode, Inc. and others.
Copyright (c) 2001-2010 International Business Machines
Copyright (C) 2009, International Business Machines
Copyright (c) 2010-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (C) 2002-2015, International Business Machines
Copyright (c) 1997-2014, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1997-2008, International Business Machines Corporation and
Copyright (c) 1997-2003, International Business Machines Corporation and
Copyright (c) 1996-2012, International Business Machines Corporation and
Copyright (c) 1997-2016, International Business Machines
Copyright (c) 1997-2013 International Business Machines
Copyright (c) 1997-2016, International Business Machines Corporation and
Copyright (c) 1997-2001, International Business Machines Corporation and
Copyright (c) 1997-2012, International Business Machines Corporation and
Copyright (c) 1997-2005, International Business Machines Corporation and
Copyright (c) 1997-2010, International Business Machines Corporation and
Copyright (c) 2011-2016, International Business Machines Corporation
Copyright (c) 1997-2009, International Business Machines Corporation and
Copyright (c) 1997-2002,2008, International Business Machines Corporation and
Copyright (c) 1997-2009,2014, International Business Machines
Copyright (C) 2000-2009, International Business Machines
Copyright (c) 1997-2015, International Business Machines Corporation and
Copyright (c) 1997-2013, International Business Machines Corporation and
Copyright (c) 2001-2016, International Business Machines Corporation and
Copyright (c) 1997-2016, International Business Machines Corporation
Copyright (c) 1997-2003, 2007-2009 International Business Machines Corporation and
Copyright (c) 2011-2014, International Business Machines Corporation
Copyright (c) 2003-2009, International Business Machines
Copyright (c) 2016, International Business Machines Corporation
Copyright (c) 1997-2004, International Business Machines Corporation and
Copyright (C) 2002-2016, International Business Machines
Copyright (C) 1998-2014, International Business Machines Corporation
Copyright (c) 2003-2013, International Business Machines Corporation and
Copyright (c) 2005-2016, International Business Machines Corporation and
Copyright (c) 1999-2013, International Business Machines Corporation and
Copyright (c) 2003-2015, International Business Machines Corporation and
Copyright (C) 2003-2016, International Business Machines
Copyright (C) 2003-2014, International Business Machines
Copyright (C) 2003, International Business Machines
Copyright (c) 1998-2016, International Business Machines Corporation and
Copyright (c) 2004-2015, International Business Machines Corporation and
Copyright (c) 2009-2016, International Business Machines Corporation and
Copyright (C) 2003-2012, International Business Machines
Copyright (c) 2000-2016, International Business Machines Corporation and
Copyright (C) 2001-2014, International Business Machines
Copyright (C) 2001-2016, International Business Machines
Copyright (c) 1997-2014, International Business Machines
© 2017 and later: Unicode, Inc. and others.
Copyright (C) 2007-2016, International Business Machines
© 2018 and later: Unicode, Inc. and others.
Copyright (c) 2015, International Business Machines Corporation
Copyright (c) 2014-2016, International Business Machines Corporation
Copyright (c) 2002-2016, International Business Machines
Copyright (c) 2001-2011,2015 International Business Machines
Copyright (c) 2001-2016 International Business Machines
Copyright (c) 2005-2013, International Business Machines Corporation and
Copyright (c) 1998-2014, International Business Machines Corporation and
Copyright (C) 1997-2016 International Business Machines
Copyright (C) 2009-2014, International Business Machines Corporation and
Copyright (c) 2002-2014, International Business Machines Corporation
Copyright (c) 2002-2007, International Business Machines Corporation
Copyright (C) 1996-2012, International Business Machines Corporation
Copyright (C) 1996-2008, International Business Machines Corporation
Copyright (C) 2007-2013, International Business Machines Corporation and
Copyright (C) 2008-2015, International Business Machines
Copyright (C) 2003-2013, International Business Machines Corporation and
Copyright (C) 2003-2013, International Business Machines Corporation
Copyright (C) 1997-2016, International Business Machines Corporation and
Copyright (C) 2001-2011, International Business Machines
Copyright (C) 2001-2008, International Business Machines
Copyright (C) 2003 - 2009, International Business Machines Corporation and
Copyright (C) 2003 - 2008, International Business Machines Corporation and
Copyright (C) 2007-2014, International Business Machines Corporation
Copyright (C) 2007-2013, International Business Machines Corporation
Copyright (C) 1997-2013, International Business Machines Corporation and
Copyright (C) 1996-2014, International Business Machines Corporation and
Copyright (C) 2010-2014, International Business Machines
Copyright (C) 2010-2015, International Business Machines
Copyright (C) 2013-2014, International Business Machines
Copyright (C) 1996-2015, International Business Machines
Copyright (C) 1996-2014, International Business Machines
Copyright (C) 2012-2015, International Business Machines
Copyright (C) 2012-2014, International Business Machines
Copyright (C) 2013-2015, International Business Machines
Copyright (C) 2013-2016, International Business Machines
Copyright (C) 1999-2016, International Business Machines
Copyright (C) 1999-2015, International Business Machines
Copyright (C) 1999-2014, International Business Machines
Copyright (C) 2015-2016, International Business Machines Corporation and others.
Copyright (C) 2003 - 2013, International Business Machines Corporation and
Copyright (C) 1999-2011, International Business Machines
Copyright (C) 2005-2016, International Business Machines
Copyright (C) 2005-2012, International Business Machines
Copyright (C) 2005-2015, International Business Machines
Copyright (C) 2005-2013, International Business Machines
Copyright (C) 2005-2014, International Business Machines
Copyright (c) 2004, International Business Machines
Copyright (c) 2004-2014 International Business Machines
Copyright (c) 2004-2014, International Business Machines
Copyright (C) 2013, International Business Machines Corporation
Copyright (C) 1997-2015, International Business Machines Corporation and
Copyright (C) 2016, International Business Machines
Copyright (c) IBM Corporation, 2000-2012. All rights reserved.
Copyright (c) IBM Corporation, 2000-2011. All rights reserved.
Copyright (c) IBM Corporation, 2000-2014. All rights reserved.
Copyright (c) IBM Corporation, 2000-2010. All rights reserved.
Copyright (c) IBM Corporation, 2000-2016. All rights reserved.
Copyright 2010 the V8 project authors. All rights reserved.
Copyright 2006-2008 the V8 project authors. All rights reserved.
Copyright 2012 the V8 project authors. All rights reserved.
Copyright (C) 2008-2016, International Business Machines Corporation and
Copyright (C) 2007-2016, International Business Machines Corporation and
Copyright (C) 2007-2012, International Business Machines Corporation and
Copyright (c) 2001-2011, International Business Machines
Copyright (c) 2001-2007, International Business Machines
Copyright (C) 2010-2014, International Business Machines Corporation and
Copyright (C) 1997-2010, International Business Machines Corporation and
Copyright (C) 1997-2012, International Business Machines Corporation and
Copyright (C) 2009-2015, International Business Machines Corporation and
Copyright (C) 2009-2012, International Business Machines Corporation and
Copyright (c) 2002-2012, International Business Machines Corporation
Copyright (c) 2002-2011, International Business Machines Corporation
Copyright (C) 2008-2013, International Business Machines Corporation and
Copyright (c) 2003-2008, International Business Machines
Copyright (C) 2003-2016, International Business Machines Corporation
Copyright (C) 2003-2014, International Business Machines Corporation
Copyright (C) 2003-2008, International Business Machines Corporation
Copyright (C) 2005-2008, International Business Machines
Copyright (C) 2003-2015, International Business Machines Corporation
Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and
Copyright (c) 2004-2016, International Business Machines
© 2020 and later: Unicode, Inc. and others.
Copyright (C) 2007-2008, International Business Machines Corporation and
Copyright (C) 2001-2007, International Business Machines
Copyright (C) 1997-2012, International Business Machines
Copyright (C) 1997-2015, International Business Machines
Copyright (C) 2001-2010, International Business Machines
Copyright (c) 2000-2005, International Business Machines
Copyright (c) 2000-2007, International Business Machines
© 2019 and later: Unicode, Inc. and others.
Copyright (C) 2010-2015, International Business Machines Corporation and
Copyright (C) 2015, International Business Machines Corporation and
Copyright (c) 2003-2013, International Business Machines
Copyright (C) 2001-2012, International Business Machines
Copyright (C) 2001-2011, International Business Machines Corporation
Copyright (C) 2014-2016, International Business Machines
Copyright (C) 1997-2015, International Business Machines Corporation
Copyright (C) 1999-2007, International Business Machines
Copyright (C) 1999-2007, International Business Machines Corporation
Copyright (C) 1999-2011, International Business Machines Corporation
Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2002-2016 International Business Machines Corporation and others.
Copyright (C) 2002-2016, International Business Machines Corporation and others.
Copyright (C) 2002-2016 International Business Machines Corporation
Copyright (C) 2002-2015, International Business Machines Corporation and others.
Copyright (C) 2012 International Business Machines Corporation
Copyright (C) 2002-2015 International Business Machines Corporation
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright (C) 2003-2010, International Business Machines Corporation and others.
Copyright (c) 2008-2011, International Business Machines Corporation and
Copyright (c) 2008-2010, International Business Machines Corporation and
Copyright (C) 2014-2016, International Business Machines Corporation and
Copyright (C) 2013, International Business Machines Corporation and
Copyright (c) 2014, International Business Machines
Copyright (C) 2014, International Business Machines
Copyright (C) 2013, International Business Machines
Copyright (C) 2001-2008,2010 IBM and others. All rights reserved.
Copyright (C) 2010, Yahoo! Inc.
Copyright (c) 1997-2011, International Business Machines Corporation and
Copyright (C) 2013-2014, International Business Machines Corporation and
Copyright (C) 2009-2013, International Business Machines Corporation and
Copyright (C) 1996-2012, International Business Machines Corporation and
Copyright (C) 2015, International Business Machines Corporation
Copyright (c) 2001-2012, International Business Machines Corporation
Copyright (C) 2001-2014 IBM and others. All rights reserved.
Copyright (C) 2008-2014, Google, International Business Machines Corporation and
Copyright (C) 2008, Google, International Business Machines Corporation and
Copyright (C) 2008-2015, Google, International Business Machines Corporation
Copyright (c) 2001-2014, International Business Machines
Copyright (c) 2002-2010, International Business Machines Corporation
Copyright (C) 2011-2015, International Business Machines Corporation and
Copyright (C) 2011-2016, International Business Machines Corporation and
Copyright (C) 2011-2012, International Business Machines Corporation and
Copyright (C) 1996-2016, International Business Machines
Copyright (C) 1998-2014, International Business Machines
Copyright (C) 2004-2016, International Business Machines
Copyright (C) 2010-2011, International Business Machines
Copyright (C) 2009-2015, International Business Machines
Copyright (C) 2015, International Business Machines
Copyright (C) 2012-2016, International Business Machines
Copyright (C) 1999-2012, International Business Machines
Copyright (C) 2001, International Business Machines
Copyright (C) 2013, International Business Machines Corporation and others.
Copyright (C) 2010-2012, International Business Machines
Copyright (C) 2004-2015, International Business Machines
Copyright (C) 2003-2006, International Business Machines
Copyright (C) 2013-2015, International Business Machines Corporation and others.
Copyright (C) 2001-2015 IBM and others. All rights reserved.
Copyright (C) 2008-2015, International Business Machines Corporation
Copyright (C) 2008-2016, International Business Machines
Copyright (C) 2008-2013, International Business Machines Corporation
Copyright (C) 2004-2012, International Business Machines Corporation and
Copyright (C) 1997-2009,2014 International Business Machines
Copyright (C) 2009-2011, International Business Machines Corporation and
Copyright (C) 2009-2016, International Business Machines Corporation and
Copyright (C) 2009-2013, International Business Machines
Copyright (C) 2008-2011, International Business Machines
Copyright (C) 2007-2014, International Business Machines Corporation and
Copyright (C) 2009-2010, International Business Machines Corporation and
Copyright (C) 2001-2016 International Business Machines Corporation
Copyright (c) 2002-2011, International Business Machines
Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
Copyright (c) 2013-2016 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2013-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2007-2012, International Business Machines Corporation and
Copyright (c) 2007-2012, International Business Machines
Copyright (C) 2010, International Business Machines
Copyright (C) 1997-2011, International Business Machines
Copyright (C) 1997-2005, International Business Machines
Copyright (C) 2009-2011, International Business Machines
Copyright (C) 2003-2015, International Business Machines
Copyright (C) 2009-2016, International Business Machines
Copyright (C) 2008-2012, International Business Machines
Copyright (C) 2008, International Business Machines
Copyright (C) 2011-2014, International Business Machines
Copyright (C) 2011-2013, International Business Machines
Copyright (C) 2005, International Business Machines
Copyright (C) 1999-2013, International Business Machines
Copyright (C) 1998-2016, International Business Machines
Copyright (c) 2007-2014, International Business Machines Corporation and
Copyright (C) 2003-2013, International Business Machines
Copyright (c) 2007-2016, International Business Machines Corporation and
Copyright (c) 2008-2015, International Business Machines
Copyright (C) 1999-2010, International Business Machines
Copyright (C) 2000-2015, International Business Machines
Copyright (C) 2000-2011, International Business Machines
Copyright (C) 2000-2012, International Business Machines
Copyright (C) 2000-2010, International Business Machines
Copyright (C) 2004-2010, International Business Machines
Copyright (C) 2004-2005, International Business Machines
Copyright (c) 2013-2014, International Business Machines
Copyright (c) 1991-2013 Unicode, Inc.
© 2019 Unicode®, Inc.
Copyright (C) 2018 and later: Unicode, Inc. and others.
Copyright (c) 2008-2013 International Business Machines
Copyright (C) 2002-2010, International Business Machines
Copyright (c) 2012-2015 International Business Machines
© 2020 Unicode®, Inc.
Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved
Copyright (c) 2011-2012, International Business Machines Corporation and
Copyright (C) 1998-2000, International Business Machines
© 2017 Unicode®, Inc.
Copyright (c) 2007-2015 International Business Machines
Copyright (C) 2004-2006, International Business Machines
Copyright (C) 2003-2005, International Business Machines
Copyright (c) 1999-2014 International Business Machines
Copyright (c) 2003, International Business Machines
Copyright (C) 2014 International Business Machines
Copyright (c) 2001-2003 International Business Machines
Copyright (c) 2004-2011 International Business Machines
Copyright (C) 2015-2016, International Business Machines
Copyright (c) 2001-2015 International Business Machines
Copyright (C) 2003-2012, International Business Machines Corporation and
Copyright (c) 2003 National Electronics and Computer Technology Center and others
Copyright (C) 2005-2010, International Business Machines
Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved
Copyright (C) 2004-2016 International Business Machines
Copyright (C) 1998-2013, International Business Machines
Copyright (C) 1998-2010, International Business Machines
Copyright (c) 1999-2004, International Business Machines
Copyright (C) 2002-2006 International Business Machines Corporation
Copyright (C) 1999-2006, International Business Machines
Copyright (C) 2002-2016 IBM, Inc. All Rights Reserved.
Copyright (c) 2002-2006, International Business Machines
(C) Copyright IBM Corp. 1998-2007 - All Rights Reserved
Copyright (C) 1999-2003, International Business Machines
Copyright (C) 1998-2006, International Business Machines Corporation and
Copyright (C) 1998-2003, International Business Machines Corporation and
Copyright (C) 2003 - 2008, International Business Machines
Copyright (C) 1999-2008, International Business Machines
Copyright (C) 1999-2001, International Business Machines
Copyright (C) 1999-2005, International Business Machines
Copyright (C) 2016 and later: Unicode, Inc. and others.
Copyright (c) 2001-2010 IBM Corporation and others. All Rights Reserved.
Copyright (C) 1998-2005, International Business Machines Corporation and
Copyright (C) 1998-2001, International Business Machines Corporation and
Copyright (c) 2002-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2000-2014, International Business Machines
Copyright (C) 1996-2013, International Business Machines
Copyright (c) 2002-2006, International Business Machines Corporation and
Copyright (c) 2004-2010, International Business Machines Corporation and
Copyright (C) 2004-2011, International Business Machines
Copyright (c) 2002-2005, International Business Machines Corporation and
Copyright (c) 2002-2014, International Business Machines
Copyright (c) 1997-2012, International Business Machines
Copyright (c) 2002-2008, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved.
Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved.
Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,and others. All Rights Reserved.
Copyright (c) 1999-2003, International Business Machines Corporation and
Copyright (c) 2003-2014, International Business Machines
Copyright (c) 2002-2010, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1999-2010, International Business Machines Corporation and
Copyright (c) 1999-2002, International Business Machines Corporation and
Copyright (C) 2002-2003, International Business Machines
Copyright (C) 2002, International Business Machines
Copyright (c) 2007, International Business Machines Corporation and
Copyright (C) 2007, International Business Machines
Copyright (C) 2001-2006, International Business Machines
Copyright (C) 2010-2014, International Business Machines Corporation and others.
Copyright (C) 2005-2016, International Business Machines Corporation and
Copyright (C) 2015-2016, International Business Machines Corporation and
Copyright (C) 2008-2012, International Business Machines Corporation
Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2014-2015 International Business Machines Corporation and others. All rights reserved.
Copyright (C) 2002-2011, International Business Machines
Copyright (c) 2003-2010, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2012 IBM Corporation and Others. All Rights Reserved.
Copyright (C) 1998-2012, International Business Machines Corporation
Copyright (c) 2009, International Business Machines Corporation and
Copyright (C) The Internet Society (2002). All Rights Reserved.
Copyright (c) 2015, International Business Machines Corporation and
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1998-2016, International Business Machines Corporation
Copyright (c) 2011-2016,International Business Machines
Copyright (C) 2012 International Business Machines Corporation and Others. All Rights Reserved.
Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2011-2012,International Business Machines
Copyright (c) 2007, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2007-2007, International Business Machines
(C) Copyright IBM Corp. 1998-2014 - All Rights Reserved
Copyright (C) 1998-2002, International Business Machines
Copyright (c) 2001-2007, International Business Machines Corporation and others. All Rights Reserved.
(C) Copyright IBM Corp. 1998-2013 - All Rights Reserved
Copyright (C) 1998-2015, International Business Machines
Copyright (C) 2001-2014 International Business Machines
Copyright (C) 2011-2016, International Business Machines
Copyright (C) 2011-2015, International Business Machines
Copyright (c) 1999-2014, International Business Machines Corporation and
Copyright (c) 1999-2009, International Business Machines Corporation and
Copyright (c) 2010,International Business Machines
Copyright (c) 2010-2016,International Business Machines
Copyright (c) 2002-2005, International Business Machines
Copyright (C) 2000-2003, International Business Machines
Copyright (c) 2008-2014, International Business Machines Corporation and
Copyright (C) 2001 - 2005, International Business Machines
Copyright (C) 2001-2005, International Business Machines
Copyright (C) 1995-2014, International Business Machines
Copyright (c) 2000-2004 IBM, Inc. and Others.
Copyright (c) 2002-2014, International Business Machines Corporation and
Copyright (c) 2007-2013, International Business Machines Corporation and
Copyright (c) 2002-2012, International Business Machines Corporation and
Copyright (C) 2002-2012, International Business Machines
Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2009-2014, International Business Machines
Copyright (C) 2008, International Business Machines Corporation and others.
Copyright (C) 2000-2016, International Business Machines
Copyright (C) 2011-2014 International Business Machines
Copyright (C) 1997-2014, International Business Machines
Copyright (C) 1997-2013, International Business Machines
Copyright (c) 2004-2006, International Business Machines
Copyright (C) 1997-2016, International Business Machines
Copyright (C) 1997-2006, International Business Machines
Copyright (C) 1997-2011, International Business Machines Corporation and others.
Copyright (C) 1997-2013, International Business Machines Corporation and others.
Copyright (c) 2004-2015, International Business Machines
Copyright (C) 2009-2017, International Business Machines Corporation,Google, and others. All Rights Reserved.
Copyright (C) 1997-2016, International Business Machines Corporation and others.
Copyright (C) 2008-2015, International Business Machines Corporation and
Copyright (C) 1997-2015, International Business Machines Corporation and others.
Copyright (C) 2014-2016, International Business Machines Corporation and others.
Copyright (c) 2014-2016, International Business Machines
Copyright (C) 2001-2011 IBM and others. All rights reserved.
Copyright (C) 1996-2014, International Business Machines Corporation and others.
Copyright (C) 1996-2016, International Business Machines Corporation and
Copyright (C) 2009-2016, International Business Machines Corporation,
Copyright (C) 2009-2010, Google, International Business Machines Corporation and
Copyright (C) 2008-2014, Google, International Business Machines Corporation
Copyright (C) 1996-2015, International Business Machines Corporation and
Copyright (c) 1996-2015, International Business Machines Corporation and others.
Copyright (C) 2010-2012,2015 International Business Machines
Copyright (C) 2007-2015, International Business Machines
Copyright (C) 2013-2014, International Business Machines Corporation and others.
Copyright (C) 2010-2013, International Business Machines
Copyright (c) 2002-2005, International Business Machines Corporation
Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
Copyright (C) 2008-2016, International Business Machines Corporation
Copyright (C) 2004 - 2008, International Business Machines Corporation and
Copyright (C) 1997-2011,2014-2015 International Business Machines
Copyright (C) 2001-2003, International Business Machines
Copyright (C) 1999-2009, International Business Machines
Copyright (C) 2020 and later: Unicode, Inc. and others.
Copyright (c) 2002, International Business Machines Corporation and
Copyright (C) 2000-2008, International Business Machines
Copyright (C) 1998-2006, International Business Machines
Copyright (C) 1998-2001, International Business Machines Corporation
Copyright (C) 1998-2004, International Business Machines Corporation
Copyright (C) 2000, International Business Machines
Copyright (c) 1999-2016, International Business Machines Corporation and
Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1999-2012, International Business Machines Corporation and
Copyright (C) 1998-2011, International Business Machines
Copyright (C) 2008-2014, International Business Machines Corporation and
Copyright (C) 2003-2004, International Business Machines
Copyright (c) 2003-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2002-2006 IBM, Inc. All Rights Reserved.
Copyright (C) 2004-2008, International Business Machines
Copyright (c) 2002-2016 International Business Machines Corporation and
Copyright (c) 2002-2015, International Business Machines Corporation and
Copyright (C) 2002-2016, International Business Machines Corporation
Copyright (c) 2002-2010,International Business Machines
Copyright (c) 2002-2014,International Business Machines
Copyright (c) 2002-2016,International Business Machines
Copyright (C) 2016 International Business Machines Corporation
Copyright © 2019 and later: Unicode, Inc. and others.
Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 2005-2006, International Business Machines Corporation and
Copyright (c) 1997-2004, International Business Machines Corporation
Copyright (c) 2012-2016, International Business Machines Corporation
Copyright (c) 2012-2014, International Business Machines Corporation and
Copyright (c) 1997-2014, International Business Machines Corporation
Copyright (c) 1996-2016, International Business Machines Corporation and
Copyright (c) 2003-2013, International Business Machines Corporation
Copyright (c) 2003-2008, International Business Machines Corporation
Copyright (c) 1997-2015, International Business Machines Corporation
Copyright (c) 2002-2016, International Business Machines Corporation and
Copyright (c) 1997-2002, International Business Machines Corporation and
Copyright (C) 1996-2012, International Business Machines
Copyright (c) 1997-2013 International Business Machines Corporation and
Copyright (c) 2010-2012, International Business Machines Corporation and
Copyright (c) 1997-2011, International Business Machines Corporation
Copyright (c) 1997-2006, International Business Machines Corporation and
Copyright (c) 2008-2016 International Business Machines Corporation and
Copyright (c) 2008-2016, International Business Machines Corporation and
Copyright (c) 1997-2016 International Business Machines Corporation and
Copyright (c) 2007-2011, International Business Machines
Copyright (c) 2007-2010, International Business Machines
Copyright (C) 2001-2016, International Business Machines Corporation and
Copyright (C) 2001-2003, International Business Machines Corporation and
Copyright (C) 2003-2011, International Business Machines
Copyright (c) 1997-2007, International Business Machines Corporation and
Copyright (c) 1997-2015, International Business Machines
Copyright (C) 2004-2009, International Business Machines Corporation and
Copyright (C) 2004, International Business Machines Corporation and
Copyright (C) 1996-2009, International Business Machines Corporation and
Copyright (C) 1996-2006, International Business Machines Corporation and
Copyright (C) 2011-2013, International Business Machines Corporation
Copyright (C) 2000-2007, International Business Machines
Copyright (c) 2001, International Business Machines Corporation and
Copyright (C) 2012-2013, International Business Machines
Copyright (c) 2010-2016, International Business Machines Corporation and
Copyright (c) 2010-2016, International Business Machines Corporation
Copyright (c) 1997-2010, International Business Machines Corporation
Copyright (c) 1997-2003, International Business Machines
Copyright (C) 2014-2015, International Business Machines Corporation and
Copyright (c) 1997-2013, International Business Machines Corporation
Copyright (c) 1999-2016, International Business Machines
Copyright (c) 1999-2016 International Business Machines Corporation and
Copyright (c) 2016, International Business Machines Corporation and
Copyright (c) 2016, International Business Machines
Copyright (c) 2013-2016, International Business Machines Corporation
Copyright (c) 2013, International Business Machines Corporation
Copyright (C) 2013-2016, International Business Machines Corporation and
Copyright (c) 2001-2010, International Business Machines Corporation and
Copyright (C) 2014, International Business Machines Corporation and
Copyright (c) 1999-2015, International Business Machines Corporation and
Copyright (C) 2001-2016, International Business Machines Corporation
Copyright (c) 2001-2008, International Business Machines Corporation and others
Copyright (C) 2003-2016, International Business Machines Corporation and
Copyright (c) 2004, International Business Machines Corporation
Copyright (C) 2001-2009, International Business Machines
Copyright (c) 2004,2011 International Business Machines
Copyright (c) 2004-2011, International Business Machines
Copyright (c) 2000-2016, International Business Machines Corporation
Copyright (c) 2001-2005, International Business Machines Corporation and
Copyright (C) 2001-2004, International Business Machines
Copyright (c) 2001-2009, International Business Machines
Copyright (c) 1997-2009, International Business Machines Corporation
Copyright (c) 1997-2013, International Business Machines
Copyright (c) 1997-2012, International Business Machines Corporation
Copyright (C) 2007-2015, International Business Machines Corporation and
Copyright (C) 2007-2011, International Business Machines Corporation and
Copyright (C) 2007, International Business Machines Corporation and
Copyright (c) 1998-2005, International Business Machines Corporation and
Copyright (c) 2002-2010, International Business Machines Corporation and
Copyright (C) 1999-2016 International Business Machines Corporation and
Copyright (c) 2004-2011, International Business Machines Corporation and
Copyright (c) 2002-2007, International Business Machines Corporation and
Copyright (C) 2003, International Business Machines Corporation and
Copyright (C) 2005-2011, International Business Machines
Copyright (C) 2011-2012, International Business Machines
Copyright (C) 2007-2012, International Business Machines
Copyright (C) 2006-2016, International Business Machines Corporation
Copyright (C) 2006-2012, International Business Machines Corporation and others.
Copyright 2007 Google Inc. All Rights Reserved.
Copyright (c) 2001-2015, International Business Machines
Copyright (C) 2006-2014, International Business Machines Corporation
Copyright (C) 2008, International Business Machines Corporation and
Copyright (C) 2009-2012, International Business Machines
Copyright (C) 2006 International Business Machines Corporation
Copyright (C) 2010-2016, International Business Machines Corporation and
Copyright (C) 2002-2014, International Business Machines Corporation and
Copyright (C) 2002-2005, International Business Machines Corporation and
Copyright (C) 2011, International Business Machines
Copyright (c) 2003-2010 International Business Machines
Copyright (C) 2003-2003, International Business Machines
Copyright (C) 1999-2016 International Business Machines Corporation
Copyright (C) 1999-2014 International Business Machines Corporation
Copyright (C) 1999-2014 International Business Machines
Copyright (C) 2002-2011, International Business Machines Corporation and others.
Copyright (C) 2002-2008, International Business Machines Corporation and others.
Copyright (C) 2002-2008 International Business Machines Corporation
Copyright (c) 2001-2005, International Business Machines
Copyright (C) 2002-2014 International Business Machines Corporation
Copyright (c) 2003-2011, International Business Machines
Copyright (C) 1998-2012, International Business Machines Corporation and
Copyright (C) 2001-2014, International Business Machines Corporation.
Copyright (C) 2001-2011, International Business Machines Corporation.
Copyright (C) 2001-2014, International Business Machines Corporation and
Copyright (C) 2001-2011, International Business Machines Corporation and
Copyright (C) 2001-2012, International Business Machines Corporation and
Copyright 2004 and onwards Google Inc.
Copyright (C) 2004-2014, International Business Machines
Copyright (C) 2006, International Business Machines
Copyright (C) 2004-2012, International Business Machines
Copyright (C) 2001-2013, International Business Machines
Copyright (C) 1998-2004, International Business Machines
Copyright (C) 2000-2013, International Business Machines
Copyright (C) 1999-2015 International Business Machines
Copyright (C) 2000-2006, International Business Machines
Copyright (C) 1999-2004, International Business Machines
Copyright (C) 2003-2007, International Business Machines
Copyright (C) 2002-2006, International Business Machines
Copyright (C) 2001-2015, International Business Machines
Copyright (c) 2001-2012, International Business Machines
Copyright (c) 2002-2004, International Business Machines
Copyright (C) 1999-2016, International Business Machines Corporation and
Copyright (c) 1996-2014, International Business Machines
Copyright (C) 1999-2016, International Business Machines Corporation
Copyright (C) 2009-2014 International Business Machines
Copyright (C) 2004-2007, International Business Machines
Copyright (c) 2001-2016, International Business Machines
Copyright (C) 2003-2009, International Business Machines
Copyright (C) 1999-2013, International Business Machines Corporation and
Copyright (C) 1999-2015, International Business Machines Corporation and
Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 2001-2016 IBM, Inc. All Rights Reserved.
Copyright (C) 1999-2016 International Business Machines
Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved.
Copyright (C) 1998-2012, International Business Machines
Copyright (C) 1991 and later: Unicode, Inc. and others.
Copyright (C) 1997-2000, International Business Machines
Copyright (c) 1999-2007, International Business Machines Corporation and
Copyright (c) 2000 IBM, Inc. and Others.
Copyright (C) 2008-2013, International Business Machines
Copyright (C) 1998-2003, 2006, International Business Machines Corporation
Copyright (c) 2002-2003,International Business Machines
Copyright (C) 2009 International Business Machines
Copyright (C) 2010-2016 International Business Machines
Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved.
Copyright (C) 1998-2008, International Business Machines
Copyright (C) 2010-2016, International Business Machines
Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved.
Copyright (C) 2008-2009, International Business Machines Corporation and
Copyright (C) 2012,2014 International Business Machines
Copyright (c) 1996-2015, International Business Machines Corporation and
Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1999-2012, International Business Machines Corporation and
Copyright (C) 1996-2013, International Business Machines Corporation
Copyright (C) 1998-2005, International Business Machines
Copyright 2001 and onwards Google Inc.
Copyright (C) 2010-2012,2014, International Business Machines
Copyright (C) 1996-2015, International Business Machines Corporation and others.
Copyright (c) 2003-2004, International Business Machines
Copyright (C) 2000-2004, International Business Machines
Copyright (C) 2002-2013, International Business Machines
Copyright (C) 2002-2011 International Business Machines Corporation and others. All Rights Reserved.
Copyright (C) 1999-2010, International Business Machines Corporation and others.
Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved.
Copyright (c) 1996-2016, International Business Machines Corporation
Copyright (C) 1997-2010, International Business Machines
Software: opencv 4.2.0
Copyright notice:
Copyright (C) 2016, NVIDIA Corporation, all rights reserved.

@@ -0,0 +1,19 @@
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    message("icu4c third-party build does not support Windows currently.")
else()
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    add_definitions(-D ENABLE_ICU4C)
endif()

@@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
endif()
if (ENABLE_MINDDATA)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)

@@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
        DESTINATION ${INSTALL_LIB_DIR}
        COMPONENT mindspore
    )
    if (CMAKE_SYSTEM_NAME MATCHES "Windows")
        message("icu4c does not support the Windows system currently.")
    else()
        file(GLOB_RECURSE ICU4C_LIB_LIST
                ${icu4c_LIBPATH}/libicuuc*
                ${icu4c_LIBPATH}/libicudata*
                ${icu4c_LIBPATH}/libicui18n*
        )
        install(
            FILES ${ICU4C_LIB_LIST}
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
        )
    endif()
endif ()
if (ENABLE_CPU)

@@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
else()
    set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
                      mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
if (ENABLE_GPUQUE)
    target_link_libraries(_c_dataengine PRIVATE gpu_queue
                          ${CUDNN_PATH}/lib64/libcudnn.so

@@ -65,8 +65,21 @@
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/text/kernels/lookup_op.h"
#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "dataset/util/random.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
@@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB);
}
void bindTensorOps5(py::module *m) {
void bindTokenizerOps(py::module *m) {
(void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
.def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
py::arg("mode") = JiebaMode::kMix)
@@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
                  const std::string &>(),
         py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
         py::arg("separator"));
  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
    *m, "WordpieceTokenizerOp", "Tokenize a scalar token or 1-D tokens into subword tokens.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}

void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace.")
    .def(py::init<>());
  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case folding to a UTF-8 string tensor.")
    .def(py::init<>());
  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply Unicode normalization to a UTF-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp",
    "Replace parts of a UTF-8 string tensor that match the regular expression 'pattern' with 'replace'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));
  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by a regular expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"),
         py::arg("keep_delim_pattern"));
  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(
    *m, "BertTokenizerOp", "Tokenizer used for BERT text processing.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}

void bindSamplerOps(py::module *m) {
@@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
.value("DE_JIEBA_HMM", JiebaMode::kHmm)
.export_values();
#ifdef ENABLE_ICU4C
(void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
.value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
.value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
.value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
.value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
.value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
.export_values();
#endif
(void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
.value("DE_INTER_LINEAR", InterpolationMode::kLinear)
.value("DE_INTER_CUBIC", InterpolationMode::kCubic)
@@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
  bindTensorOps2(&m);
  bindTensorOps3(&m);
  bindTensorOps4(&m);
  bindTokenizerOps(&m);
  bindSamplerOps(&m);
  bindDatasetOps(&m);
  bindInfoObjects(&m);
  bindVocabObjects(&m);
  bindGraphData(&m);
  bindDependIcuTokenizerOps(&m);
}
} // namespace dataset
} // namespace mindspore

@@ -1,8 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()

add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )

@@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;

// Characters that always delimit tokens: the four ASCII punctuation ranges,
// Unicode punctuation (\p{P}), and the CJK ideograph blocks (CJK Unified
// Ideographs, Extensions A-E, and the compatibility blocks), so CJK text is
// split per character.
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"
  "|[:-@]"
  "|[\\[-`]"
  "|[{-~]"
  "|[\\p{P}]"
  "|[\\x{4E00}-\\x{9FFF}]"
  "|[\\x{3400}-\\x{4DBF}]"
  "|[\\x{20000}-\\x{2A6DF}]"
  "|[\\x{2A700}-\\x{2B73F}]"
  "|[\\x{2B740}-\\x{2B81F}]"
  "|[\\x{2B820}-\\x{2CEAF}]"
  "|[\\x{F900}-\\x{FAFF}]"
  "|[\\x{2F800}-\\x{2FA1F}]";
// Special BERT tokens; the trailing '|' lets this be prepended to other patterns.
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      normalization_form_(normalization_form),
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    keep_delim_pattern = delim_pattern;
  } else {
    keep_delim_pattern = kCommonPattern;
  }
  if (preserve_unused_token_) {
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}
Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be a scalar string tensor");
  }
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    // to lower case
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
    cur_input = processed_tensor;
    // strip accent characters
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
  return regex_tokenizer_->Compute(processed_tensor, output);
}
} // namespace dataset
} // namespace mindspore
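
To make the pattern wiring above easier to follow, a hedged restatement of what each flag combination produces (comments only; this mirrors the constructor logic and adds no behavior):

// Illustrative only -- how the constructor flags shape the two patterns
// handed to RegexTokenizerOp:
//
//   keep_whitespace = true:
//     delim_pattern      = "\\s+|" + kCommonPattern  // split on whitespace too
//     keep_delim_pattern = delim_pattern             // whitespace survives as tokens
//
//   keep_whitespace = false:
//     keep_delim_pattern = kCommonPattern            // punctuation/CJK kept, whitespace dropped
//
//   preserve_unused_token = true (the default) additionally prepends kUnusedPattern
//   to both, so "[CLS]", "[SEP]", "[UNK]", "[PAD]" and "[MASK]" are emitted intact.
BasicTokenizerOp tokenizer(/*lower_case=*/false, /*keep_whitespace=*/true);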

@@ -0,0 +1,64 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class BasicTokenizerOp : public TensorOp {
 public:
  static const bool kDefLowerCase;
  static const bool kDefKeepWhitespace;
  static const NormalizeForm kDefNormalizationForm;
  static const bool kDefPreserveUnusedToken;

  BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
                   NormalizeForm normalization_form = kDefNormalizationForm,
                   bool preserve_unused_token = kDefPreserveUnusedToken);

  ~BasicTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }

  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  static const char kCommonPattern[];
  static const char kUnusedPattern[];
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalization_form_;
  bool preserve_unused_token_;
  std::unique_ptr<CaseFoldOp> case_fold_;
  std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
  std::unique_ptr<NormalizeUTF8Op> common_normalize_;
  std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
  std::unique_ptr<RegexReplaceOp> replace_control_chars_;
  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_

@@ -0,0 +1,27 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/bert_tokenizer_op.h"
namespace mindspore {
namespace dataset {
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  std::shared_ptr<Tensor> basic_tensor;
  RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
  RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore
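
A hedged worked example of this two-stage Compute (the input string, the vocabulary, and the "##" suffix indicator are illustrative assumptions, not values taken from the commit):

// input (scalar string)       : "unaffable day"
// after basic_tokenizer_      : ["unaffable", "day"]
// after wordpiece_tokenizer_, assuming a vocab of {"un", "##aff", "##able", "day"}
// and "##" as the suffix indicator: ["un", "##aff", "##able", "day"]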

@@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class BertTokenizerOp : public TensorOp {
 public:
  BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
                  const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                  const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                  const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}

  ~BertTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }

  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

 private:
  WordpieceTokenizerOp wordpiece_tokenizer_;
  BasicTokenizerOp basic_tokenizer_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_

View File

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/case_fold_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
icu::ErrorCode error;
const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed.");
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
icu::StringByteSink<std::string> sink(&strs[i++]);
nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
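
CaseFoldOp delegates entirely to ICU's NFKC_Casefold transform, so lower-casing and compatibility normalization happen in a single pass over each tensor element. A minimal sketch of the Python-side op added later in this commit (the corpus path is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

case_fold = nlp.CaseFold()
data = ds.TextFileDataset("corpus.txt", shuffle=False)  # hypothetical input file
data = data.map(operations=case_fold)
# Per the C++ unit test below, "Welcome to China. \n 中国\t北京" folds to
# "welcome to china. \n 中国\t北京".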

View File

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class CaseFoldOp : public TensorOp {
public:
CaseFoldOp() {}
~CaseFoldOp() override = default;
void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_

View File

@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin
}
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {

View File

@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
: vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
RETURN_UNEXPECTED_IF_NULL(vocab_);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
std::vector<WordIdType> word_ids;

View File

@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector<int32_t> &ngrams, int32_t l_len, int32_t r_le
separator_(separator) {}
Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor");
std::vector<int32_t> offsets; // offsets for each str
std::vector<std::string> res; // holds the result of ngrams

View File

@ -0,0 +1,75 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/normalize_utf8_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"
namespace mindspore {
namespace dataset {
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
icu::ErrorCode error;
const icu::Normalizer2 *normalize = nullptr;
switch (normalize_form_) {
case NormalizeForm::kNone: {
*output = input;
return Status::OK();
}
case NormalizeForm::kNfc: {
normalize = icu::Normalizer2::getNFCInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
break;
}
case NormalizeForm::kNfkc: {
normalize = icu::Normalizer2::getNFKCInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
break;
}
case NormalizeForm::kNfd: {
normalize = icu::Normalizer2::getNFDInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
break;
}
case NormalizeForm::kNfkd: {
normalize = icu::Normalizer2::getNFKDInstance(error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
break;
}
default: {
RETURN_STATUS_UNEXPECTED("unexpected normalize form");
break;
}
}
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
icu::StringByteSink<std::string> sink(&strs[i++]);
normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
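
kNone is deliberately a pass-through that returns the input tensor unchanged; the four ICU-backed forms differ in whether they compose (NFC/NFKC) or decompose (NFD/NFKD) and in whether compatibility mappings are applied (the K forms). A hedged sketch of the visible difference through the Python op added later in this commit:

import mindspore.dataset as ds
import mindspore.dataset.text as nlp
from mindspore.dataset.text.utils import NormalizeForm

nfkc = nlp.NormalizeUTF8(normalize_form=NormalizeForm.NFKC)
data = ds.TextFileDataset("normalize.txt", shuffle=False)  # test data file added below
data = data.map(operations=nfkc)
# NFKC rewrites compatibility characters, e.g. the superscript in "2⁵" becomes "25";
# NFC/NFD only recompose or decompose combining marks and leave "2⁵" as is.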

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
enum class NormalizeForm {
kNone = 0,
kNfc,
kNfkc,
kNfd,
kNfkd,
};
class NormalizeUTF8Op : public TensorOp {
public:
static const NormalizeForm kDefNormalizeForm;
explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
~NormalizeUTF8Op() override = default;
void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
private:
NormalizeForm normalize_form_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_

View File

@ -0,0 +1,57 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_replace_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
std::string *out) const {
CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null");
UErrorCode icu_error = U_ZERO_ERROR;
icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
matcher->reset(unicode_text);
icu::UnicodeString unicode_out;
if (replace_all_) {
unicode_out = matcher->replaceAll(replace_, icu_error);
} else {
unicode_out = matcher->replaceFirst(replace_, icu_error);
}
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");
unicode_out.toUTF8String(*out);
return Status::OK();
}
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
UErrorCode icu_error = U_ZERO_ERROR;
icu::RegexMatcher matcher(pattern_, 0, icu_error);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Create icu RegexMatcher failed, you may input one error pattern");
std::vector<std::string> strs(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i++]));
}
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
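
Note that a single RegexMatcher is compiled once per Compute call and reset for every element, so the per-row cost is only the match itself. A minimal Python sketch matching the C++ unit test below:

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

replace_op = nlp.RegexReplace(pattern="\\s+", replace="_", replace_all=True)
data = ds.TextFileDataset("regex_replace.txt", shuffle=False)  # test data file added below
data = data.map(operations=replace_op)
# "Welcome to China. \n 中国\t北京" -> "Welcome_to_China._中国_北京"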

View File

@ -0,0 +1,55 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
#include <memory>
#include <string>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class RegexReplaceOp : public TensorOp {
public:
RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
: pattern_(icu::UnicodeString::fromUTF8(pattern)),
replace_(icu::UnicodeString::fromUTF8(replace)),
replace_all_(replace_all) {}
~RegexReplaceOp() override = default;
void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;
private:
const icu::UnicodeString pattern_;
const icu::UnicodeString replace_;
const bool replace_all_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_

View File

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
namespace mindspore {
namespace dataset {
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode) const {
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
int total_len = input.length();
int end = start + len;
CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range");
icu::UnicodeString temp;
input.extract(start, len, temp);
if (out_utf8 != nullptr) {
temp.toUTF8String(*out_utf8);
}
if (out_unicode != nullptr) {
*out_unicode = temp;
}
return Status::OK();
}
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
UErrorCode status = U_ZERO_ERROR;
out_tokens->clear();
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Create icu RegexMatcher failed, you may input one error pattern");
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
token_matcher.reset(utext);
int token_start_index = 0;
status = U_ZERO_ERROR;
while (token_matcher.find(status) && U_SUCCESS(status)) {
int deli_start_index = token_matcher.start(status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
int deli_end_index = token_matcher.end(status);
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");
// Add non-empty token
int token_len = deli_start_index - token_start_index;
if (token_len > 0) {
std::string token;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
out_tokens->emplace_back(std::move(token));
}
int delim_len = deli_end_index - deli_start_index;
if (keep_delim_ && delim_len > 0) {
icu::UnicodeString delim_str;
std::string delim_utf8_str;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
delim_matcher.reset(delim_str);
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
out_tokens->emplace_back(std::move(delim_utf8_str));
}
}
token_start_index = deli_end_index;
}
if (token_start_index < utext.length()) {
std::string temp;
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
out_tokens->emplace_back(std::move(temp));
}
return Status::OK();
}
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view text;
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
std::vector<std::string> tokens;
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
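
GetRegexTokens walks the delimiter matches left to right: text between matches becomes a token, and a matched delimiter is re-emitted as a token only when keep_delim_ is set and the delimiter also fully matches keep_delim_pattern_. A hedged Python sketch (the first pattern is the one used in the C++ unit test below):

import mindspore.dataset.text as nlp

# Split on control/format characters and whitespace runs; emit no delimiter tokens.
tokenizer = nlp.RegexTokenizer(delim_pattern="\\p{Cc}|\\p{Cf}|\\s+", keep_delim_pattern="")
# Keeping delimiters instead: any delimiter that also matches '\\s+' is emitted as a token.
tokenizer_keep = nlp.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="\\s+")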

View File

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "unicode/regex.h"
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class RegexTokenizerOp : public TensorOp {
public:
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
keep_delim_(!keep_delim_pattern.empty()) {}
~RegexTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
icu::UnicodeString *out_unicode = nullptr) const;
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
private:
const icu::UnicodeString delim_pattern_;
const icu::UnicodeString keep_delim_pattern_;
const bool keep_delim_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_

View File

@ -28,6 +28,7 @@ namespace mindspore {
namespace dataset {
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp {
} // namespace dataset
} // namespace mindspore
#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_

View File

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
UScriptCode last_script = USCRIPT_INVALID_CODE;
icu::ErrorCode status;
int start = 0;
int len = 0;
std::vector<std::string> splits;
bool was_space = false;
for (size_t i = 0; i < runes.size(); i++) {
bool is_space = u_isUWhiteSpace(runes[i].rune);
UScriptCode script = uscript_getScript(runes[i].rune, status);
if (status.isFailure()) {
status.reset();
script = USCRIPT_INVALID_CODE;
}
// 1) Separate UTF-8 strings of different UScriptCode values
// (such as: "Chinese中国" should be split into ["Chinese", "中国"])
// 2) Separate whitespace and non-whitespace UTF-8 strings
// (such as: " ." should be split into [" ", "."])
if (len > 0 && (script != last_script || is_space != was_space)) {
// 3) If keep_whitespace_ is false, all whitespace characters will be discarded
if (keep_whitespace_ || !was_space) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
start = runes[i].offset;
len = runes[i].len;
} else {
len += runes[i].len;
}
last_script = script;
was_space = is_space;
}
if (len > 0 && (keep_whitespace_ || !was_space)) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
// 4) If the input is empty scalar string, the output will be 1-D empty string.
if (splits.empty()) {
splits.emplace_back("");
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
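
A token boundary is therefore emitted whenever the UScriptCode changes or the run flips between whitespace and non-whitespace; with keep_whitespace_ unset, the whitespace runs are simply dropped. Python-side sketch, with the expected outputs taken from the C++ unit test below:

import mindspore.dataset.text as nlp

tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
# "Welcome to China. \n 中国\t北京" -> ['Welcome', 'to', 'China', '.', '中国', '北京']
tokenizer_ws = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
# Same input keeps the runs: ['Welcome', ' ', 'to', ' ', 'China', '.', ' \n ', '中国', '\t', '北京']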

View File

@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class UnicodeScriptTokenizerOp : public TensorOp {
public:
static const bool kDefKeepWhitespace;
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
~UnicodeScriptTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
private:
bool keep_whitespace_;  // Whether or not to keep whitespace tokens
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_

View File

@ -0,0 +1,73 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "unicode/errorcode.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
}
std::string_view str;
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
RuneStrArray runes;
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
std::vector<std::string> splits;
int start = 0;
int len = 0;
for (size_t i = 0; i < runes.size(); i++) {
if (u_isUWhiteSpace(runes[i].rune)) {
if (len > 0) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
len = 0;
}
} else {
if (len == 0) {
start = runes[i].offset;
}
len += runes[i].len;
}
}
if (len > 0) {
std::string temp(str.substr(start, len));
splits.emplace_back(std::move(temp));
}
if (splits.empty()) {
splits.emplace_back("");
}
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
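
This is the degenerate case of the script tokenizer: split only at ICU whitespace and never emit the whitespace itself. Python-side sketch, with outputs per the C++ unit test below:

import mindspore.dataset.text as nlp

tokenizer = nlp.WhitespaceTokenizer()
# "Welcome to China." -> ['Welcome', 'to', 'China.']
# An empty or all-whitespace scalar yields a 1-D tensor holding one empty string.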

View File

@ -0,0 +1,39 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class WhitespaceTokenizerOp : public TensorOp {
public:
WhitespaceTokenizerOp() {}
~WhitespaceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_

View File

@ -0,0 +1,138 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include <algorithm>
#include <utility>
namespace mindspore {
namespace dataset {
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
const int &max_bytes_per_token, const std::string &unknown_token)
: vocab_(vocab),
suffix_indicator_(suffix_indicator),
max_bytes_per_token_(max_bytes_per_token),
unknown_token_(unknown_token) {}
void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const {
int rows = tokens.size();
int max_cols = 0;
for (int i = 0; i < rows; i++) {
max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
}
out_padded_tokens->resize(rows * max_cols, padded_str);
for (int i = 0; i < rows; i++) {
int index = i * max_cols;
for (int j = 0; j < tokens[i].size(); j++) {
(*out_padded_tokens)[index++] = tokens[i][j];
}
}
*out_cols = max_cols;
}
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
bool *out_found, int *out_end) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
*out_found = false;
for (int i = runes.size() - 1; i >= 0; i--) {
*out_end = runes[i].offset + runes[i].len;
int len = *out_end - start;
std::string word = input_token.substr(start, len);
if (start > 0) {
word = suffix_indicator_ + word;
}
WordIdType default_id = -1;
if (vocab_->Lookup(word, default_id) != default_id) {
*out_found = true;
break;
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
out_tokens->clear();
if (unknown_token_.empty()) {
out_tokens->emplace_back(input_token);
} else {
out_tokens->emplace_back(unknown_token_);
}
return Status::OK();
}
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
std::vector<std::string> *out_tokens) const {
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
std::string subword = input_token.substr(start, end - start);
if (start > 0) {
subword = suffix_indicator_ + subword;
}
out_tokens->emplace_back(subword);
return Status::OK();
}
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
if (input_token.size() > max_bytes_per_token_) {
return FoundNoToken(input_token, out_tokens);
}
RuneStrArray runes;
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
}
int end;
for (int start = 0; start < input_token.size();) {
bool found;
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
if (found) {
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
start = end;
} else {
return FoundNoToken(input_token, out_tokens);
}
}
return Status::OK();
}
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
}
std::vector<std::vector<std::string>> out_tokens(input->Size());
int i = 0;
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
}
std::vector<std::string> padded_tokens;
int cols = 0;
PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
std::vector<dsize_t> shapes;
if (input->Rank() == 1) {
shapes.push_back(out_tokens.size());
}
shapes.push_back(cols);
*output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
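
LookupWord scans candidate end positions from the longest remaining substring down to a single rune, so GetTokens implements the usual greedy longest-match-first WordPiece segmentation; every subword that does not start the token gets suffix_indicator_ prepended before the vocab lookup, and PadTokens right-pads ragged rows with '<pad>' so a 1-D string input comes back as a rectangular 2-D tensor. A small Python sketch (toy vocab, not from this commit):

import mindspore.dataset.text as nlp

vocab = nlp.Vocab.from_list(["work", "##ing", "hour", "##s", "[UNK]"])
tokenizer = nlp.WordpieceTokenizer(vocab=vocab)
# 'working' -> ['work', '##ing']; 'hours' -> ['hour', '##s'].
# A token with no vocab match maps to unknown_token ('[UNK]'); with unknown_token=''
# the original token is passed through unchanged.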

View File

@ -0,0 +1,68 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
#define DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
#include <memory>
#include <string>
#include <string_view>
#include <vector>
#include "cppjieba/Unicode.hpp"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
#include "dataset/text/vocab.h"
#include "dataset/util/status.h"
using cppjieba::DecodeRunesInString;
using cppjieba::RuneStrArray;
namespace mindspore {
namespace dataset {
class WordpieceTokenizerOp : public TensorOp {
public:
static const char kDefSuffixIndicator[];
static const int kDefMaxBytesPerToken;
static const char kDefUnknownToken[];
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
const int &max_bytes_per_token = kDefMaxBytesPerToken,
const std::string &unknown_token = kDefUnknownToken);
~WordpieceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
protected:
void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
std::vector<std::string> *out_padded_tokens, int *out_cols) const;
Status AddSubword(const std::string &input_token, const int start, const int end,
std::vector<std::string> *out_tokens) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
int *out_end) const;
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
private:
const std::shared_ptr<Vocab> vocab_;
const std::string suffix_indicator_;
const int max_bytes_per_token_;
const std::string unknown_token_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_

View File

@ -15,5 +15,18 @@
"""
mindspore.dataset.text
"""
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram
from .utils import to_str, to_bytes, JiebaMode, Vocab
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer"
]
if platform.system().lower() != 'windows':
from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \
RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer
__all__ += ["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8",
"RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"]

View File

@ -17,10 +17,11 @@ c transforms for all text related operators
import os
import re
import platform
import mindspore._c_dataengine as cde
from .utils import JiebaMode
from .utils import JiebaMode, NormalizeForm
from .validators import check_lookup, check_jieba_add_dict, \
check_jieba_add_word, check_jieba_init, check_ngram
@ -174,3 +175,172 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
"""
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
"""
Tokenize scalar token or 1-D tokens to subword tokens.
Args
vocab(Vocab): a Vocab object.
suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
unknown_token(string, optional): When we can not found the token: if 'unknown_token' is empty string,
return the token directly, else return 'unknown_token'(default '[UNK]').
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
self.vocab = vocab
self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token)
if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace characters (such as ' ', '\t', '\r', '\n').
"""
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
Args:
keep_whitespace(bool, optional): Whether or not to emit whitespace tokens (default False).
"""
def __init__(self, keep_whitespace=False):
self.keep_whitespace = keep_whitespace
super().__init__(self.keep_whitespace)
class CaseFold(cde.CaseFoldOp):
"""
Apply case fold operation on UTF-8 string tensor.
"""
DE_C_INTER_NORMALIZE_FORM = {
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
}
class NormalizeUTF8(cde.NormalizeUTF8Op):
"""
Apply normalize operation on UTF-8 string tensor.
Args:
normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
If set to "NONE", nothing is done to the input string tensor.
If set to any of "NFC", "NFKC", "NFD", "NFKD", the corresponding normalization is applied (default "NFKC").
See http://unicode.org/reports/tr15/ for details.
"""
def __init__(self, normalize_form=NormalizeForm.NFKC):
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
super().__init__(self.normalize_form)
class RegexReplace(cde.RegexReplaceOp):
"""
Replace parts of a UTF-8 string tensor with 'replace' according to the regular expression 'pattern'.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
pattern(string): the regex expression pattern.
replace(string): the string used to replace matched elements.
replace_all(bool, optional): If False, only replace the first matched element;
if True, replace all matched elements (default True).
"""
def __init__(self, pattern, replace, replace_all=True):
self.pattern = pattern
self.replace = replace
self.replace_all = replace_all
super().__init__(self.pattern, self.replace, self.replace_all)
class RegexTokenizer(cde.RegexTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
See http://userguide.icu-project.org/strings/regexp for supported regex patterns.
Args:
delim_pattern(string): The pattern of regex delimiters.
The original string will be split at matched delimiters.
keep_delim_pattern(string, optional): A delimiter matched by 'delim_pattern' is kept as a token
if it also matches 'keep_delim_pattern'. The default value is an empty string (''),
in which case delimiters are not kept as output tokens.
"""
def __init__(self, delim_pattern, keep_delim_pattern=''):
self.delim_pattern = delim_pattern
self.keep_delim_pattern = keep_delim_pattern
super().__init__(self.delim_pattern, self.keep_delim_pattern)
class BasicTokenizer(cde.BasicTokenizerOp):
"""
Tokenize a scalar tensor of UTF-8 string by specific rules.
Args:
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
to the input text to fold it to lower case and strip accent characters; if False, only apply
the NormalizeUTF8 operation in 'normalization_form' mode to the input text (default False).
keep_whitespace(bool, optional): If True, whitespace is kept in the output tokens (default False).
normalization_form(Enum, optional): Used to specify the normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True).
"""
def __init__(self, lower_case=False, keep_whitespace=False,
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
self.lower_case = lower_case
self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
super().__init__(self.lower_case, self.keep_whitespace,
self.normalization_form, self.preserve_unused_token)
class BertTokenizer(cde.BertTokenizerOp):
"""
Tokenizer used for Bert text processing.
Args:
vocab(Vocab): a Vocab object.
suffix_indicator(string, optional): Prepended to mark a subword that continues a word rather than
starting it (default '##').
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split (default 100).
unknown_token(string, optional): When a token cannot be found in the vocab: if 'unknown_token' is an empty
string, the token is returned directly; otherwise 'unknown_token' is returned (default '[UNK]').
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
to the input text to fold it to lower case and strip accent characters; if False, only apply
the NormalizeUTF8 operation in 'normalization_form' mode to the input text (default False).
keep_whitespace(bool, optional): If True, whitespace is kept in the output tokens (default False).
normalization_form(Enum, optional): Used to specify the normalization mode,
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default True).
"""
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
self.vocab = vocab
self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token
self.lower_case = lower_case
self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token)
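
For reference, when lower_case=True the underlying BasicTokenizerOp is roughly the composition of the standalone ops defined above; an equivalent explicit pipeline can be sketched as below. The accent-stripping pattern is illustrative only (the real op keeps its patterns internal, and it additionally splits CJK characters and punctuation):

import mindspore.dataset as ds
import mindspore.dataset.text as nlp
from mindspore.dataset.text.utils import NormalizeForm

ops = [nlp.CaseFold(),
       nlp.NormalizeUTF8(NormalizeForm.NFD),
       nlp.RegexReplace(pattern="\\p{Mn}", replace="", replace_all=True),  # strip combining marks
       nlp.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="")]
data = ds.TextFileDataset("basic_tokenizer.txt", shuffle=False)
for op in ops:
    data = data.map(operations=op)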

View File

@ -127,3 +127,11 @@ class JiebaMode(IntEnum):
MIX = 0
MP = 1
HMM = 2
class NormalizeForm(IntEnum):
NONE = 0
NFC = 1
NFKC = 2
NFD = 3
NFKD = 4

View File

@ -18,7 +18,14 @@
#include <string_view>
#include "common/common.h"
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
@ -105,3 +112,229 @@ TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp());
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
std::shared_ptr<Tensor> output;
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 3);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "China.");
input = std::make_shared<Tensor>(" hello");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>("hello");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>("hello ");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
CheckEqual(output, {0}, "hello");
input = std::make_shared<Tensor>(" ");
s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true));
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 10);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, " ");
CheckEqual(output, {2}, "to");
CheckEqual(output, {3}, " ");
CheckEqual(output, {4}, "China");
CheckEqual(output, {5}, ".");
CheckEqual(output, {6}, " \n ");
CheckEqual(output, {7}, "中国");
CheckEqual(output, {8}, "\t");
CheckEqual(output, {9}, "北京");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 6);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "China");
CheckEqual(output, {3}, ".");
CheckEqual(output, {4}, "中国");
CheckEqual(output, {5}, "北京");
input = std::make_shared<Tensor>(" Welcome to 中国. ");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 4);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
CheckEqual(output, {0}, "Welcome");
CheckEqual(output, {1}, "to");
CheckEqual(output, {2}, "中国");
CheckEqual(output, {3}, ".");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 8);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
CheckEqual(output, {0}, " ");
CheckEqual(output, {1}, "Welcome");
CheckEqual(output, {2}, " ");
CheckEqual(output, {3}, "to");
CheckEqual(output, {4}, " ");
CheckEqual(output, {5}, "中国");
CheckEqual(output, {6}, ".");
CheckEqual(output, {7}, " ");
input = std::make_shared<Tensor>("Hello");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
CheckEqual(output, {0}, "Hello");
input = std::make_shared<Tensor>("H");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
CheckEqual(output, {0}, "H");
input = std::make_shared<Tensor>("");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor7: " << output->ToString();
CheckEqual(output, {0}, "");
input = std::make_shared<Tensor>("Hello中国Hello世界");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 4);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor8: " << output->ToString();
CheckEqual(output, {0}, "Hello");
CheckEqual(output, {1}, "中国");
CheckEqual(output, {2}, "Hello");
CheckEqual(output, {3}, "世界");
input = std::make_shared<Tensor>(" ");
s = keep_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor10: " << output->ToString();
CheckEqual(output, {0}, " ");
input = std::make_shared<Tensor>(" ");
s = skip_whitespace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 1);
MS_LOG(INFO) << "Out tensor11: " << output->ToString();
CheckEqual(output, {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
MS_LOG(INFO) << "Doing TestCaseFold.";
std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp());
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = case_fold_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 0);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {}, "welcome to china. \n 中国\t北京");
}
TEST_F(MindDataTestTokenizerOp, TestNormalize) {
MS_LOG(INFO) << "Doing TestNormalize.";
std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc));
std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc));
std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd));
std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("ṩ");
std::shared_ptr<Tensor> output;
Status s = nfc_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFC str:" << output->ToString();
s = nfkc_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFKC str:" << output->ToString();
s = nfd_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFD str:" << output->ToString();
s = nfkd_normalize_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
MS_LOG(INFO) << "NFKD str:" << output->ToString();
}
TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
MS_LOG(INFO) << "Doing TestRegexReplace.";
std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = regex_replace_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1);
EXPECT_EQ(output->Rank(), 0);
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
CheckEqual(output, {}, "Welcome_to_China._中国_北京");
}
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", ""));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = regex_tokenizer_op->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
}
TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
MS_LOG(INFO) << "Doing TestBasicTokenizer.";
// bool lower_case, bool keep_whitespace,
// NormalizeForm normalization_form, bool preserve_unused_token
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
std::shared_ptr<Tensor> output;
Status s = basic_tokenizer->Compute(input, &output);
EXPECT_TRUE(s.IsOk());
}

View File

@ -0,0 +1,7 @@
Welcome to Beijing北京欢迎您
長風破浪會有時,直掛雲帆濟滄海
😀嘿嘿😃哈哈😄大笑😁嘻嘻
明朝(1368—1644年)和清朝(1644—1911年)是中国封建王朝史上最后两个朝代
明代(1368-1644)と清代(1644-1911)は、中国の封建王朝の歴史における最後の2つの王朝でした
명나라 (1368-1644)와 청나라 (1644-1911)는 중국 봉건 왕조의 역사에서 마지막 두 왕조였다
Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ

View File

@ -0,0 +1,14 @@
床前明月光
疑是地上霜
举头望明月
低头思故乡
I am making small mistakes during working hours
😀嘿嘿😃哈哈😄大笑😁嘻嘻
繁體字
unused [CLS]
unused [SEP]
unused [UNK]
unused [PAD]
unused [MASK]
12+/-28=40/-16
Hello World!

View File

@ -0,0 +1,6 @@
ṩ
ḍ̇
q̣̇
ﬁ
2⁵
ẛ̣

View File

@ -0,0 +1,8 @@
Hello World
Let's Go
1:hello
2:world
31:beijing
Welcome to China!
我 不想 长大
Welcome to Shenzhen!

View File

@ -0,0 +1,3 @@
Welcome to Shenzhen!
北京欢迎您!Welcome to Beijing!
12¥+36¥=?

View File

@ -0,0 +1,25 @@
my
favorite
book
is
love
during
the
cholera
era
what

View File

@ -0,0 +1,83 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BasicTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
test_paras = [
dict(
first=1,
last=6,
expected_tokens=
[['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
'(', '1644', '—', '1911', '年', ')', '是', '中', '国', '封', '建',
'王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']]
),
dict(
first=7,
last=7,
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
lower_case=True
),
]
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(text, expected_tokens[count])
count = count + 1
def test_basic_tokenizer():
"""
Test BasicTokenizer
"""
for paras in test_paras:
check_basic_tokenizer(**paras)
if __name__ == '__main__':
test_basic_tokenizer()

View File

@ -0,0 +1,183 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing BertTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
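# Vocabulary shared by every case below; it deliberately mixes Chinese characters,
# English wordpieces ('##' marks a suffix piece), emoji, digits, and the BERT
# special tokens, so each fixture line is either fully covered or unknown on purpose.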
vocab_bert = [
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "",
"i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
"😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I",
"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"
]
pad = '<pad>'
test_paras = [
# test chinese text
dict(
first=1,
last=4,
expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
[['疑'], ['是'], ['地'], ['上'], ['霜']],
[['举'], ['头'], ['望'], ['明'], ['月']],
[['低'], ['头'], ['思'], ['故'], ['乡']]],
vocab_list=vocab_bert
),
# test english text
dict(
first=5,
last=5,
expect_str=[[['i', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
lower_case=True,
vocab_list=vocab_bert
),
dict(
first=5,
last=5,
expect_str=[[['I', pad],
["am", pad],
['mak', '##ing'],
['small', pad],
['mistake', '##s'],
['during', pad],
['work', '##ing'],
['hour', '##s']]],
lower_case=False,
vocab_list=vocab_bert
),
# test emoji tokens
dict(
first=6,
last=7,
expect_str=[
[['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
[['繁'], ['體'], ['字']]],
normalization_form=nlp.utils.NormalizeForm.NFKC,
vocab_list=vocab_bert
),
# test preserved tokens
dict(
first=8,
last=12,
expect_str=[
[['[UNK]'], ['[CLS]']],
[['[UNK]'], ['[SEP]']],
[['[UNK]'], ['[UNK]']],
[['[UNK]'], ['[PAD]']],
[['[UNK]'], ['[MASK]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
),
# test special symbol
dict(
first=13,
last=13,
expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
preserve_unused_token=True,
vocab_list=vocab_bert
),
# test non-default params
dict(
first=8,
last=8,
expect_str=[
[['[UNK]'], [' '], ['[CLS]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
keep_whitespace=True
),
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['[CLS]']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=True,
keep_whitespace=True,
unknown_token=''
),
dict(
first=8,
last=8,
expect_str=[
[['unused'], [' '], ['['], ['CLS'], [']']],
],
lower_case=False,
vocab_list=vocab_bert,
preserve_unused_token=False,
keep_whitespace=True,
unknown_token=''
),
]
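# BertTokenizer is BasicTokenizer (case folding, normalization, CJK/punctuation
# splitting) followed by WordpieceTokenizer over the resulting words.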
def check_bert_tokenizer(first, last, expect_str,
vocab_list,
suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE,
preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list)
tokenizer_op = nlp.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator,
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
lower_case=lower_case, keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count])
count = count + 1
def test_bert_tokenizer():
"""
Test BertTokenizer
"""
for paras in test_paras:
check_bert_tokenizer(**paras)
if __name__ == '__main__':
test_bert_tokenizer()


@ -15,11 +15,15 @@
"""
Testing UnicodeCharTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
def split_by_unicode_char(input_strs):
@ -48,5 +52,182 @@ def test_unicode_char_tokenizer():
assert split_by_unicode_char(input_strs) == tokens
def test_whitespace_tokenizer():
"""
Test WhitespaceTokenizer
"""
whitespace_strs = [["Welcome", "to", "Beijing!"],
["北京欢迎您!"],
["我喜欢English!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.WhitespaceTokenizer()
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is : {}".format(tokens))
assert whitespace_strs == tokens
def test_unicode_script_tokenizer():
"""
Test UnicodeScriptTokenizer with keep_whitespace=False
"""
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
["北京欢迎您", ""],
["我喜欢", "English", "!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is : {}".format(tokens))
assert unicode_script_strs == tokens
def test_unicode_script_tokenizer2():
"""
Test UnicodeScriptTokenizer with keep_whitespace=True
"""
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
["北京欢迎您", ""],
["我喜欢", "English", "!"],
[" "]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
dataset = dataset.map(operations=tokenizer)
tokens = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
tokens.append(text)
logger.info("The out tokens is :", tokens)
assert unicode_script_strs2 == tokens
def test_case_fold():
"""
Test CaseFold
"""
expect_strs = ["welcome to beijing!", "北京欢迎您！", "我喜欢english!", " "]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
op = nlp.CaseFold()
dataset = dataset.map(operations=op)
lower_strs = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
lower_strs.append(text)
assert lower_strs == expect_strs
def test_normalize_utf8():
"""
Test NormalizeUTF8
"""
def normalize(normalize_form):
dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
dataset = dataset.map(operations=normalize_op)
out_bytes = []
out_texts = []
for i in dataset.create_dict_iterator():
out_bytes.append(i['text'])
out_texts.append(nlp.to_str(i['text']).tolist())
logger.info("The out bytes is : ", out_bytes)
logger.info("The out texts is: ", out_texts)
return out_bytes
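# Expected bytes per Unicode normalization form. For intuition, the stdlib agrees
# with these fixtures: unicodedata.normalize('NFKC', 'ﬁ') == 'fi' and
# unicodedata.normalize('NFKC', '2⁵') == '25', while NFC keeps both compatibility
# characters intact and NFD/NFKD decompose into base letters plus combining marks.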
expect_normalize_data = [
# NFC
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
# NFKC
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'fi', b'25', b'\xe1\xb9\xa9'],
# NFD
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
# NFKD
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'fi', b'25', b's\xcc\xa3\xcc\x87']
]
assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normalize_data[0]
assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normalize_data[2]
assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normalize_data[3]
def test_regex_replace():
"""
Test RegexReplace
"""
def regex_replace(first, last, expect_str, pattern, replace):
dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
replace_op = nlp.RegexReplace(pattern, replace)
dataset = dataset.map(operations=replace_op)
out_text = []
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
out_text.append(text)
logger.info("Out:", out_text)
logger.info("Exp:", expect_str)
assert expect_str == out_text
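# Patterns exercised below: \p{Ll} matches lowercase letters; ^(\d:|b:) matches a
# leading 'digit:' or 'b:' prefix; \s+ matches runs of whitespace; \p{Cc}/\p{Cf}
# match control and format characters.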
regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
def test_regex_tokenizer():
"""
Test RegexTokenizer
"""
def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
dataset = dataset.map(operations=tokenizer_op)
out_text = []
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist()
np.testing.assert_array_equal(text, expect_str[count])
count += 1
out_text.append(text)
logger.info("Out:", out_text)
logger.info("Exp:", expect_str)
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
if __name__ == '__main__':
test_unicode_char_tokenizer()
test_whitespace_tokenizer()
test_unicode_script_tokenizer()
test_unicode_script_tokenizer2()
test_case_fold()
test_normalize_utf8()
test_regex_replace()
test_regex_tokenizer()


@ -0,0 +1,113 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing WordpieceTokenizer op in DE
"""
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
import mindspore.dataset.text as nlp
WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
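# WordpieceTokenizer greedily matches the longest vocabulary prefix, then continues
# with '##'-prefixed suffix pieces, e.g. 'favorite' -> ['favor', '##ite']; a word
# with no full segmentation becomes unknown_token.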
vocab_english = [
"book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
]
vocab_chinese = [
"", '', '', '', '', '', '', '', '', '', '', '', ''
]
vocab_mix = vocab_chinese + vocab_english
test_paras = [
dict(
first=1,
last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['[UNK]']],
vocab_list=vocab_english
),
dict(
first=1,
last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['what']],
vocab_list=vocab_english,
unknown_token=""
),
dict(
first=1,
last=10,
expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
vocab_list=vocab_english,
max_bytes_per_token=4
),
dict(
first=11,
last=25,
expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']],
vocab_list=vocab_chinese,
),
dict(
first=25,
last=25,
expect_str=[['您']],
vocab_list=vocab_chinese,
unknown_token=""
),
dict(
first=1,
last=25,
expect_str=[
['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
['[UNK]'],
['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']],
vocab_list=vocab_mix,
),
]
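# Helper mirrors the other tokenizer tests: slice lines [first, last] from the
# fixture, run WordpieceTokenizer with the given vocab, and compare row by row.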
def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list)
tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
max_bytes_per_token=max_bytes_per_token)
dataset = dataset.map(operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text'])
logger.info("Out:", text)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count])
count = count + 1
def test_wordpiece_tokenizer():
"""
Test WordpieceTokenizer
"""
for paras in test_paras:
check_wordpiece_tokenizer(**paras)
if __name__ == '__main__':
test_wordpiece_tokenizer()

third_party/icu4c/filter.json

@ -0,0 +1,6 @@
{
"strategy": "additive",
"featureFilters": {
"normalization": "include"
}
}