forked from mindspore-Ecosystem/mindspore
Add WhitespaceTokenizer and UnicodeScriptTokenizer for NLP.
Add CaseFold, NormalizeUTF8, RegexReplace, RegexTokenizer, BasicTokenizer, WordpieceTokenizer, and BertTokenizer.
parent ea37dc76f0
commit 4f16f036be
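For context, a minimal sketch of how the ops introduced in this commit might be driven from the Python dataset API. The Python wrapper names (CaseFold, NormalizeUTF8, WhitespaceTokenizer) and the corpus.txt input file are assumptions layered on top of the C++ bindings shown in this diff, not something this commit itself shows:

# Hedged sketch only: wrapper names assumed from the C++ ops bound below.
import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt", shuffle=False)  # hypothetical one-sentence-per-line file
data = data.map(input_columns=["text"], operations=text.CaseFold())
data = data.map(input_columns=["text"], operations=text.NormalizeUTF8())
data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())
for row in data.create_dict_iterator():
    print(row["text"])  # 1-D array of UTF-8 tokens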
@@ -3057,6 +3057,587 @@ Software: tinyxml2 8.0.0
Copyright 2011, John Resig.
|
||||
Copyright 2011, The Dojo Foundation.
|
||||
|
||||
Software: icu 67.1
|
||||
Copyright (C) 2000-2004, International Business Machines Corporation
|
||||
Copyright (C) 2002-2014, International Business Machines(C) Copyright IBM Corp. 1998-2011 - All Rights Reserved
|
||||
Copyright (C) 2003-2008, International Business Machines
|
||||
Copyright (C) 2005-2006, International Business Machines
|
||||
Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2001-2010 International Business Machines
|
||||
Copyright (C) 2009, International Business Machines
|
||||
Copyright (c) 2010-2015 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (C) 2002-2015, International Business Machines verbatim (minus copyright and #include) and copied together into this file.
|
||||
Copyright (c) 1997-2014, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2003, International Business Machines Corporation and
|
||||
Copyright (c) 1996-2012, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2016, International Business Machines
|
||||
Copyright (c) 1997-2013 International Business Machines
|
||||
Copyright (c) 1997-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2001, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2012, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2005, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
Copyright (c) 2011-2016, International Business Machines Corporation
|
||||
Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2002,2008, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2009,2014, International Business Machines
|
||||
Copyright (C) 2000-2009, International Business Machines
|
||||
Copyright (c) 1997-2015, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2013, International Business Machines Corporation and
|
||||
Copyright (c) 2001-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2016, International Business Machines Corporation
|
||||
Copyright (c) 1997-2003, 2007-2009 International Business Machines Corporation and
|
||||
Copyright (c) 2011-2014, International Business Machines Corporation
|
||||
Copyright (c) 2003-2009, International Business Machines
|
||||
Copyright (c) 2016, International Business Machines Corporation
|
||||
Copyright (c) 1997-2004, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2016, International Business Machines
|
||||
Copyright (C) 1998-2014, International Business Machines Corporation
|
||||
Copyright (c) 2003-2013, International Business Machines Corporation and
|
||||
Copyright (c) 2005-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1999-2013, International Business Machines Corporation and
|
||||
Copyright (c) 2003-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2016, International Business Machines
|
||||
Copyright (C) 2003-2014, International Business Machines
|
||||
Copyright (C) 2003, International Business Machines
|
||||
Copyright (c) 1998-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2004-2015, International Business Machines Corporation and
|
||||
Copyright (c) 2009-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2012, International Business Machines
|
||||
Copyright (c) 2000-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2014, International Business Machines
|
||||
Copyright (C) 2001-2016, International Business Machines
|
||||
Copyright (c) 1997-2014, International Business Machines © 2017 and later: Unicode, Inc. and others.
|
||||
Copyright (C) 2007-2016, International Business Machines © 2018 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2015, International Business Machines Corporation
|
||||
Copyright (c) 2014-2016, International Business Machines Corporation
|
||||
Copyright (c) 2002-2016, International Business Machines
|
||||
Copyright (c) 2001-2011,2015 International Business Machines
|
||||
Copyright (c) 2001-2016 International Business Machines
|
||||
Copyright (c) 2005-2013, International Business Machines Corporation and
|
||||
Copyright (c) 1998-2014, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2016 International Business Machines
|
||||
Copyright (C) 2009-2014, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2014, International Business Machines Corporation
|
||||
Copyright (c) 2002-2007, International Business Machines Corporation
|
||||
Copyright (C) 1996-2012, International Business Machines Corporation
|
||||
Copyright (C) 1996-2008, International Business Machines Corporation
|
||||
Copyright (C) 2007-2013, International Business Machines Corporation and
|
||||
Copyright (C) 2008-2015, International Business Machines
|
||||
Copyright (C) 2003-2013, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2013, International Business Machines Corporation
|
||||
Copyright (C) 1997-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2011, International Business Machines
|
||||
Copyright (C) 2001-2008, International Business Machines
|
||||
Copyright (C) 2003 - 2009, International Business Machines Corporation and
|
||||
Copyright (C) 2003 - 2008, International Business Machines Corporation and
|
||||
Copyright (C) 2007-2014, International Business Machines Corporation
|
||||
Copyright (C) 2007-2013, International Business Machines Corporation
|
||||
Copyright (C) 1997-2013, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2010-2014, International Business Machines
|
||||
Copyright (C) 2010-2015, International Business Machines
|
||||
Copyright (C) 2013-2014, International Business Machines
|
||||
Copyright (C) 1996-2015, International Business Machines
|
||||
Copyright (C) 1996-2014, International Business Machines
|
||||
Copyright (C) 2012-2015, International Business Machines
|
||||
Copyright (C) 2012-2014, International Business Machines
|
||||
Copyright (C) 2013-2015, International Business Machines
|
||||
Copyright (C) 2013-2016, International Business Machines
|
||||
Copyright (C) 1999-2016, International Business Machines
|
||||
Copyright (C) 1999-2015, International Business Machines
|
||||
Copyright (C) 1999-2014, International Business Machines
|
||||
Copyright (C) 2015-2016, International Business Machines Corporation and others.
|
||||
Copyright (C) 2003 - 2013, International Business Machines Corporation and
|
||||
Copyright (C) 1999-2011, International Business Machines
|
||||
Copyright (C) 2005-2016, International Business Machines
|
||||
Copyright (C) 2005-2012, International Business Machines
|
||||
Copyright (C) 2005-2015, International Business Machines
|
||||
Copyright (C) 2005-2013, International Business Machines
|
||||
Copyright (C) 2005-2014, International Business Machines
|
||||
Copyright (c) 2004, International Business Machines
|
||||
Copyright (c) 2004-2014 International Business Machines
|
||||
Copyright (c) 2004-2014, International Business Machines
|
||||
Copyright (C) 2013, International Business Machines Corporation
|
||||
Copyright (C) 1997-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2016, International Business Machines
|
||||
Copyright (c) IBM Corporation, 2000-2012. All rights reserved.
|
||||
Copyright (c) IBM Corporation, 2000-2011. All rights reserved.
|
||||
Copyright (c) IBM Corporation, 2000-2014. All rights reserved.
|
||||
Copyright (c) IBM Corporation, 2000-2010. All rights reserved.
|
||||
Copyright (c) IBM Corporation, 2000-2016. All rights reserved.
|
||||
Copyright 2010 the V8 project authors. All rights reserved.
|
||||
Copyright 2006-2008 the V8 project authors. All rights reserved.
|
||||
Copyright 2012 the V8 project authors. All rights reserved.
|
||||
Copyright (C) 2008-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2007-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2007-2012, International Business Machines Corporation and
|
||||
Copyright (c) 2001-2011, International Business Machines
|
||||
Copyright (c) 2001-2007, International Business Machines
|
||||
Copyright (C) 2010-2014, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2010, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2012, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2012, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2012, International Business Machines Corporation
|
||||
Copyright (c) 2002-2011, International Business Machines Corporation
|
||||
Copyright (C) 2008-2013, International Business Machines Corporation and
|
||||
Copyright (c) 2003-2008, International Business Machines
|
||||
Copyright (C) 2003-2016, International Business Machines Corporation
|
||||
Copyright (C) 2003-2014, International Business Machines Corporation
|
||||
Copyright (C) 2003-2008, International Business Machines Corporation
|
||||
Copyright (C) 2005-2008, International Business Machines
|
||||
Copyright (C) 2003-2015, International Business Machines Corporation
|
||||
Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and
|
||||
Copyright (c) 2004-2016, International Business Machines © 2020 and later: Unicode, Inc. and others.
|
||||
Copyright (C) 2007-2008, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2007, International Business Machines
|
||||
Copyright (C) 1997-2012, International Business Machines
|
||||
Copyright (C) 1997-2015, International Business Machines
|
||||
Copyright (C) 2001-2010, International Business Machines
|
||||
Copyright (c) 2000-2005, International Business Machines
|
||||
Copyright (c) 2000-2007, International Business Machines © 2019 and later: Unicode, Inc. and others.
|
||||
Copyright (C) 2010-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2015, International Business Machines Corporation and
|
||||
Copyright (c) 2003-2013, International Business Machines
|
||||
Copyright (C) 2001-2012, International Business Machines
|
||||
Copyright (C) 2001-2011, International Business Machines Corporation
|
||||
Copyright (C) 2014-2016, International Business Machines
|
||||
Copyright (C) 1997-2015, International Business Machines Corporation
|
||||
Copyright (C) 1999-2007, International Business Machines
|
||||
Copyright (C) 1999-2007, International Business Machines Corporation
|
||||
Copyright (C) 1999-2011, International Business Machines Corporation
|
||||
Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2002-2016 International Business Machines Corporation and others.
|
||||
Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
Copyright (C) 2002-2016 International Business Machines Corporation
|
||||
Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
Copyright (C) 2012 International Business Machines Corporation
|
||||
Copyright (C) 2002-2015 International Business Machines Corporation
|
||||
Copyright (C) 2004-2015, International Business Machines Corporation and others.
|
||||
Copyright (C) 2003-2010, International Business Machines Corporation and others.
|
||||
Copyright (c) 2008-2011, International Business Machines Corporation and
|
||||
Copyright (c) 2008-2010, International Business Machines Corporation and
|
||||
Copyright (C) 2014-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2013, International Business Machines Corporation and
|
||||
Copyright (c) 2014, International Business Machines
|
||||
Copyright (C) 2014, International Business Machines
|
||||
Copyright (C) 2013, International Business Machines
|
||||
Copyright (C) 2001-2008,2010 IBM and others. All rights reserved.
|
||||
Copyright (C) 2010 , Yahoo! Inc.
|
||||
Copyright (c) 1997-2011, International Business Machines Corporation and
|
||||
Copyright (C) 2013-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2013, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2012, International Business Machines Corporation and
|
||||
Copyright (C) 2015, International Business Machines Corporation
|
||||
Copyright (c) 2001-2012, International Business Machines Corporation
|
||||
Copyright (C) 2001-2014 IBM and others. All rights reserved.
|
||||
Copyright (C) 2008-2014, Google, International Business Machines Corporation and
|
||||
Copyright (C) 2008, Google, International Business Machines Corporation and
|
||||
Copyright (C) 2008-2015, Google, International Business Machines Corporation
|
||||
Copyright (c) 2001-2014, International Business Machines
|
||||
Copyright (c) 2002-2010, International Business Machines Corporation
|
||||
Copyright (C) 2011-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2011-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2011-2012, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2016, International Business Machines
|
||||
Copyright (C) 1998-2014, International Business Machines
|
||||
Copyright (C) 2004-2016, International Business Machines
|
||||
Copyright (C) 2010-2011, International Business Machines
|
||||
Copyright (C) 2009-2015, International Business Machines
|
||||
Copyright (C) 2015, International Business Machines
|
||||
Copyright (C) 2012-2016, International Business Machines
|
||||
Copyright (C) 1999-2012, International Business Machines
|
||||
Copyright (C) 2001, International Business Machines
|
||||
Copyright (C) 2013, International Business Machines Corporation and others.
|
||||
Copyright (C) 2010-2012, International Business Machines
|
||||
Copyright (C) 2004-2015, International Business Machines
|
||||
Copyright (C) 2003-2006, International Business Machines
|
||||
Copyright (C) 2013-2015, International Business Machines Corporation and others.
|
||||
Copyright (C) 2001-2015 IBM and others. All rights reserved.
|
||||
Copyright (C) 2008-2015, International Business Machines Corporation
|
||||
Copyright (C) 2008-2016, International Business Machines
|
||||
Copyright (C) 2008-2013, International Business Machines Corporation
|
||||
Copyright (C) 2004-2012, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2009,2014 International Business Machines
|
||||
Copyright (C) 2009-2011, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2013, International Business Machines
|
||||
Copyright (C) 2008-2011, International Business Machines
|
||||
Copyright (C) 2007-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2010, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2016 International Business Machines Corporation
|
||||
Copyright (c) 2002-2011, International Business Machines
|
||||
Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
|
||||
Copyright (c) 2013-2016 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (c) 2013-2015 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (c) 2007-2012, International Business Machines Corporation and
|
||||
Copyright (c) 2007-2012, International Business Machines
|
||||
Copyright (C) 2010, International Business Machines
|
||||
Copyright (C) 1997-2011, International Business Machines
|
||||
Copyright (C) 1997-2005, International Business Machines
|
||||
Copyright (C) 2009-2011, International Business Machines
|
||||
Copyright (C) 2003-2015, International Business Machines
|
||||
Copyright (C) 2009-2016, International Business Machines
|
||||
Copyright (C) 2008-2012, International Business Machines
|
||||
Copyright (C) 2008, International Business Machines
|
||||
Copyright (C) 2011-2014, International Business Machines
|
||||
Copyright (C) 2011-2013, International Business Machines
|
||||
Copyright (C) 2005, International Business Machines
|
||||
Copyright (C) 1999-2013, International Business Machines
|
||||
Copyright (C) 1998-2016, International Business Machines
|
||||
Copyright (c) 2007-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2013, International Business Machines
|
||||
Copyright (c) 2007-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2008-2015, International Business Machines
|
||||
Copyright (C) 1999-2010, International Business Machines
|
||||
Copyright (C) 2000-2015, International Business Machines
|
||||
Copyright (C) 2000-2011, International Business Machines
|
||||
Copyright (C) 2000-2012, International Business Machines
|
||||
Copyright (C) 2000-2010, International Business Machines
|
||||
Copyright (C) 2004-2010, International Business Machines
|
||||
Copyright (C) 2004-2005, International Business Machines
|
||||
Copyright (c) 2013-2014, International Business Machines
|
||||
Copyright (c) 1991-2013 Unicode, Inc. © 2019 Unicode®, Inc.
|
||||
Copyright (C) 2018 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2008-2013 International Business Machines
|
||||
Copyright (C) 2002-2010, International Business Machines
|
||||
Copyright (c) 2012-2015 International Business Machines © 2020 Unicode®, Inc.
|
||||
Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved
|
||||
Copyright (c) 2011-2012, International Business Machines Corporation and
|
||||
Copyright (C) 1998-2000, International Business Machines © 2017 Unicode®, Inc.
|
||||
Copyright (c) 2007-2015 International Business Machines
|
||||
Copyright (C) 2004-2006, International Business Machines
|
||||
Copyright (C) 2003-2005, International Business Machines
|
||||
Copyright (c) 1999-2014 International Business Machines
|
||||
Copyright (c) 2003, International Business Machines
|
||||
Copyright (C) 2014 International Business Machines
|
||||
Copyright (c) 2001-2003 International Business Machines
|
||||
Copyright (c) 2004-2011 International Business Machines
|
||||
Copyright (C) 2015-2016, International Business Machines
|
||||
Copyright (c) 2001-2015 International Business Machines
|
||||
Copyright (C) 2003-2012, International Business Machines Corporation and COPYRIGHT AND PERMISSION NOTICE
|
||||
Copyright (c) 2003 National Electronics and Computer Technology Center and others
|
||||
Copyright (C) 2005-2010, International Business Machines
|
||||
Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved
|
||||
Copyright (C) 2004-2016 International Business Machines
|
||||
Copyright (C) 1998-2013, International Business Machines
|
||||
Copyright (C) 1998-2010, International Business Machines
|
||||
Copyright (c) 1999-2004, International Business Machines
|
||||
Copyright (C) 2002-2006 International Business Machines Corporation
|
||||
Copyright (C) 1999-2006, International Business Machines
|
||||
Copyright (C) 2002-2016 IBM, Inc. All Rights Reserved.
|
||||
Copyright (c) 2002-2006, International Business Machines(C) Copyright IBM Corp. 1998-2007 - All Rights Reserved
|
||||
Copyright (C) 1999-2003, International Business Machines
|
||||
Copyright (C) 1998-2006, International Business Machines Corporation and
|
||||
Copyright (C) 1998-2003, International Business Machines Corporation and
|
||||
Copyright (C) 2003 - 2008, International Business Machines
|
||||
Copyright (C) 1999-2008, International Business Machines
|
||||
Copyright (C) 1999-2001, International Business Machines
|
||||
Copyright (C) 1999-2005, International Business Machines
|
||||
Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2001-2010 IBM Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 1998-2005, International Business Machines Corporation and
|
||||
Copyright (C) 1998-2001, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2005, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2000-2014, International Business Machines
|
||||
Copyright (C) 1996-2013, International Business Machines
|
||||
Copyright (c) 2002-2006, International Business Machines Corporation and
|
||||
Copyright (c) 2004-2010, International Business Machines Corporation and
|
||||
Copyright (C) 2004-2011, International Business Machines
|
||||
Copyright (c) 2002-2005, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2014, International Business Machines
|
||||
Copyright (c) 1997-2012, International Business Machines
|
||||
Copyright (c) 2002-2008, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved.
|
||||
Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved.
|
||||
Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,and others. All Rights Reserved.
|
||||
Copyright (c) 1999-2003, International Business Machines Corporation and
|
||||
Copyright (c) 2003-2014, International Business Machines
|
||||
Copyright (c) 2002-2010, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 1999-2010, International Business Machines Corporation and
|
||||
Copyright (c) 1999-2002, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2003, International Business Machines
|
||||
Copyright (C) 2002, International Business Machines
|
||||
Copyright (c) 2007, International Business Machines Corporation and
|
||||
Copyright (C) 2007, International Business Machines
|
||||
Copyright (C) 2001-2006, International Business Machines
|
||||
Copyright (C) 2010-2014, International Business Machines Corporation and others.
|
||||
Copyright (C) 2005-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2015-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2008-2012, International Business Machines Corporation
|
||||
Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (c) 2014-2015 International Business Machines Corporation and others. All rights reserved.
|
||||
Copyright (C) 2002-2011, International Business Machines
|
||||
Copyright (c) 2003-2010, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2012 IBM Corporation and Others. All Rights Reserved.
|
||||
Copyright (C) 1998-2012, International Business Machines Corporation
|
||||
Copyright (c) 2009, International Business Machines Corporation and
|
||||
Copyright (C) The Internet Society (2002). All Rights Reserved.
|
||||
Copyright (c) 2015, International Business Machines Corporation and
|
||||
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 1998-2016, International Business Machines Corporation
|
||||
Copyright (c) 2011-2016,International Business Machines
|
||||
Copyright (C) 2012 International Business Machines Corporation and Others. All Rights Reserved.
|
||||
Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2011, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 2011-2012,International Business Machines
|
||||
Copyright (c) 2007, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2007-2007, International Business Machines(C) Copyright IBM Corp. 1998-2014 - All Rights Reserved
|
||||
Copyright (C) 1998-2002, International Business Machines
|
||||
Copyright (c) 2001-2007, International Business Machines Corporation and others. All Rights Reserved.(C) Copyright IBM Corp. 1998-2013 - All Rights Reserved
|
||||
Copyright (C) 1998-2015, International Business Machines
|
||||
Copyright (C) 2001-2014 International Business Machines
|
||||
Copyright (C) 2011-2016, International Business Machines
|
||||
Copyright (C) 2011-2015, International Business Machines
|
||||
Copyright (c) 1999-2014, International Business Machines Corporation and
|
||||
Copyright (c) 1999-2009, International Business Machines Corporation and
|
||||
Copyright (c) 2010,International Business Machines
|
||||
Copyright (c) 2010-2016,International Business Machines
|
||||
Copyright (c) 2002-2005, International Business Machines
|
||||
Copyright (C) 2000-2003, International Business Machines
|
||||
Copyright (c) 2008-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2001 - 2005, International Business Machines
|
||||
Copyright (C) 2001-2005, International Business Machines
|
||||
Copyright (C) 1995-2014, International Business Machines
|
||||
Copyright (c) 2000-2004 IBM, Inc. and Others.
|
||||
Copyright (c) 2002-2014, International Business Machines Corporation and
|
||||
Copyright (c) 2007-2013, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2012, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2012, International Business Machines
|
||||
Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
|
||||
Copyright (c) 2002, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2009-2014, International Business Machines
|
||||
Copyright (C) 2008, International Business Machines Corporation and others.
|
||||
Copyright (C) 2000-2016, International Business Machines
|
||||
Copyright (C) 2011-2014 International Business Machines
|
||||
Copyright (C) 1997-2014, International Business Machines
|
||||
Copyright (C) 1997-2013, International Business Machines
|
||||
Copyright (c) 2004-2006, International Business Machines
|
||||
Copyright (C) 1997-2016, International Business Machines
|
||||
Copyright (C) 1997-2006, International Business Machines
|
||||
Copyright (C) 1997-2011, International Business Machines Corporation and others.
|
||||
Copyright (C) 1997-2013, International Business Machines Corporation and others.
|
||||
Copyright (c) 2004-2015, International Business Machines
|
||||
Copyright (C) 2009-2017, International Business Machines Corporation,Google, and others. All Rights Reserved.
|
||||
Copyright (C) 1997-2016, International Business Machines Corporation and others.
|
||||
Copyright (C) 2008-2015, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2015, International Business Machines Corporation and others.
|
||||
Copyright (C) 2014-2016, International Business Machines Corporation and others.
|
||||
Copyright (c) 2014-2016, International Business Machines
|
||||
Copyright (C) 2001-2011 IBM and others. All rights reserved.
|
||||
Copyright (C) 1996-2014, International Business Machines Corporation and others.
|
||||
Copyright (C) 1996-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2016, International Business Machines Corporation,
|
||||
Copyright (C) 2009-2010, Google, International Business Machines Corporation and
|
||||
Copyright (C) 2008-2014, Google, International Business Machines Corporation
|
||||
Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
Copyright (c) 1996-2015, International Business Machines Corporation and others.
|
||||
Copyright (C) 2010-2012,2015 International Business Machines
|
||||
Copyright (C) 2007-2015, International Business Machines
|
||||
Copyright (C) 2013-2014, International Business Machines Corporation and others.
|
||||
Copyright (C) 2010-2013, International Business Machines
|
||||
Copyright (c) 2002-2005, International Business Machines Corporation
|
||||
Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
|
||||
Copyright (C) 2008-2016, International Business Machines Corporation
|
||||
Copyright (C) 2004 - 2008, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2011,2014-2015 International Business Machines
|
||||
Copyright (C) 2001-2003, International Business Machines
|
||||
Copyright (C) 1999-2009, International Business Machines
|
||||
Copyright (C) 2020 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2002, International Business Machines Corporation and
|
||||
Copyright (C) 2000-2008, International Business Machines
|
||||
Copyright (C) 1998-2006, International Business Machines
|
||||
Copyright (C) 1998-2001, International Business Machines Corporation
|
||||
Copyright (C) 1998-2004, International Business Machines Corporation
|
||||
Copyright (C) 2000, International Business Machines
|
||||
Copyright (c) 1999-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 1999-2012, International Business Machines Corporation and
|
||||
Copyright (C) 1998-2011, International Business Machines
|
||||
Copyright (C) 2008-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2004, International Business Machines
|
||||
Copyright (c) 2003-2005, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2002-2006 IBM, Inc. All Rights Reserved.
|
||||
Copyright (C) 2004-2008, International Business Machines
|
||||
Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
Copyright (c) 2002-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
Copyright (c) 2002-2010,International Business Machines
|
||||
Copyright (c) 2002-2014,International Business Machines
|
||||
Copyright (c) 2002-2016,International Business Machines
|
||||
Copyright (C) 2016 International Business Machines Corporation
|
||||
Copyright © 2019 and later: Unicode, Inc. and others.
|
||||
Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 2005-2006, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2004, International Business Machines Corporation
|
||||
Copyright (c) 2012-2016, International Business Machines Corporation
|
||||
Copyright (c) 2012-2014, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2014, International Business Machines Corporation
|
||||
Copyright (c) 1996-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2003-2013, International Business Machines Corporation
|
||||
Copyright (c) 2003-2008, International Business Machines Corporation
|
||||
Copyright (c) 1997-2015, International Business Machines Corporation
|
||||
Copyright (c) 2002-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2002, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2012, International Business Machines
|
||||
Copyright (c) 1997-2013 International Business Machines Corporation and
|
||||
Copyright (c) 2010-2012, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2011, International Business Machines Corporation
|
||||
Copyright (c) 1997-2006, International Business Machines Corporation and
|
||||
Copyright (c) 2008-2016 International Business Machines Corporation and
|
||||
Copyright (c) 2008-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2016 International Business Machines Corporation and
|
||||
Copyright (c) 2007-2011, International Business Machines
|
||||
Copyright (c) 2007-2010, International Business Machines
|
||||
Copyright (C) 2001-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2003, International Business Machines Corporation and
|
||||
Copyright (C) 2003-2011, International Business Machines
|
||||
Copyright (c) 1997-2007, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2015, International Business Machines
|
||||
Copyright (C) 2004-2009, International Business Machines Corporation and
|
||||
Copyright (C) 2004, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2009, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2006, International Business Machines Corporation and
|
||||
Copyright (C) 2011-2013, International Business Machines Corporation
|
||||
Copyright (C) 2000-2007, International Business Machines
|
||||
Copyright (c) 2001, International Business Machines Corporation and
|
||||
Copyright (C) 2012-2013, International Business Machines
|
||||
Copyright (c) 2010-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2010-2016, International Business Machines Corporation
|
||||
Copyright (c) 1997-2010, International Business Machines Corporation
|
||||
Copyright (c) 1997-2003, International Business Machines
|
||||
Copyright (C) 2014-2015, International Business Machines Corporation and
|
||||
Copyright (c) 1997-2013, International Business Machines Corporation
|
||||
Copyright (c) 1999-2016, International Business Machines
|
||||
Copyright (c) 1999-2016 International Business Machines Corporation and
|
||||
Copyright (c) 2016, International Business Machines Corporation and
|
||||
Copyright (c) 2016, International Business Machines
|
||||
Copyright (c) 2013-2016, International Business Machines Corporation
|
||||
Copyright (c) 2013, International Business Machines Corporation
|
||||
Copyright (C) 2013-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2001-2010, International Business Machines Corporation and
|
||||
Copyright (C) 2014, International Business Machines Corporation and
|
||||
Copyright (c) 1999-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2016, International Business Machines orporation
|
||||
Copyright (c) 2001-2008, International Business Machines Corporation and others
|
||||
Copyright (C) 2003-2016, International Business Machines Corporation and
|
||||
Copyright (c) 2004, International Business Machines Corporation
|
||||
Copyright (C) 2001-2009, International Business Machines
|
||||
Copyright (c) 2004,2011 International Business Machines
|
||||
Copyright (c) 2004-2011, International Business Machines
|
||||
Copyright (c) 2000-2016, International Business Machines Corporation
|
||||
Copyright (c) 2001-2005, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2004, International Business Machines
|
||||
Copyright (c) 2001-2009, International Business Machines
|
||||
Copyright (c) 1997-2009, International Business Machines Corporation
|
||||
Copyright (c) 1997-2013, International Business Machines
|
||||
Copyright (c) 1997-2012, International Business Machines Corporation
|
||||
Copyright (C) 2007-2015, International Business Machines Corporation and
|
||||
Copyright (C) 2007-2011, International Business Machines Corporation and
|
||||
Copyright (C) 2007, International Business Machines Corporation and
|
||||
Copyright (c) 1998-2005, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2010, International Business Machines Corporation and
|
||||
Copyright (C) 1999-2016 International Business Machines Corporation and
|
||||
Copyright (c) 2004-2011, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2007, International Business Machines Corporation and
|
||||
Copyright (C) 2003, International Business Machines Corporation and
|
||||
Copyright (C) 2005-2011, International Business Machines
|
||||
Copyright (C) 2011-2012, International Business Machines
|
||||
Copyright (C) 2007-2012, International Business Machines
|
||||
Copyright (C) 2006-2016, International Business Machines Corporation
|
||||
Copyright (C) 2006-2012, International Business Machines Corporation and others.
|
||||
Copyright 2007 Google Inc. All Rights Reserved.
|
||||
Copyright (c) 2001-2015, International Business Machines
|
||||
Copyright (C) 2006-2014, International Business Machines Corporation
|
||||
Copyright (C) 2008, International Business Machines Corporation and
|
||||
Copyright (C) 2009-2012, International Business Machines
|
||||
Copyright (C) 2006 International Business Machines Corporation
|
||||
Copyright (C) 2010-2016, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2002-2005, International Business Machines Corporation and
|
||||
Copyright (C) 2011, International Business Machines
|
||||
Copyright (c) 2003-2010 International Business Machines
|
||||
Copyright (C) 2003-2003, International Business Machines
|
||||
Copyright (C) 1999-2016 International Business Machines Corporation
|
||||
Copyright (C) 1999-2014 International Business Machines Corporation
|
||||
Copyright (C) 1999-2014 International Business Machines
|
||||
Copyright (C) 2002-2011, International Business Machines Corporation and others.
|
||||
Copyright (C) 2002-2008, International Business Machines Corporation and others.
|
||||
Copyright (C) 2002-2008 International Business Machines Corporation
|
||||
Copyright (c) 2001-2005, International Business Machines
|
||||
Copyright (C) 2002-2014 International Business Machines Corporation
|
||||
Copyright (c) 2003-2011, International Business Machines
|
||||
Copyright (C) 1998-2012, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2014, International Business Machines Corporation.
|
||||
Copyright (C) 2001-2011, International Business Machines Corporation.
|
||||
Copyright (C) 2001-2014, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2011, International Business Machines Corporation and
|
||||
Copyright (C) 2001-2012, International Business Machines Corporation and
|
||||
Copyright 2004 and onwards Google Inc.
|
||||
Copyright (C) 2004-2014, International Business Machines
|
||||
Copyright (C) 2006, International Business Machines
|
||||
Copyright (C) 2004-2012, International Business Machines
|
||||
Copyright (C) 2001-2013, International Business Machines
|
||||
Copyright (C) 1998-2004, International Business Machines
|
||||
Copyright (C) 2000-2013, International Business Machines
|
||||
Copyright (C) 1999-2015 International Business Machines
|
||||
Copyright (C) 2000-2006, International Business Machines
|
||||
Copyright (C) 1999-2004, International Business Machines
|
||||
Copyright (C) 2003-2007, International Business Machines
|
||||
Copyright (C) 2002-2006, International Business Machines
|
||||
Copyright (C) 2001-2015, International Business Machines
|
||||
Copyright (c) 2001-2012, International Business Machines
|
||||
Copyright (c) 2002-2004, International Business Machines
|
||||
Copyright (C) 1999-2016, International Business Machines Corporation and
|
||||
Copyright (c) 1996-2014, International Business Machines
|
||||
Copyright (C) 1999-2016, International Business Machines Corporation
|
||||
Copyright (C) 2009-2014 International Business Machines
|
||||
Copyright (C) 2004-2007, International Business Machines
|
||||
Copyright (c) 2001-2016, International Business Machines
|
||||
Copyright (C) 2003-2009, International Business Machines
|
||||
Copyright (C) 1999-2013, International Business Machines Corporation and
|
||||
Copyright (C) 1999-2015, International Business Machines Corporation and
|
||||
Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 2001-2016 IBM, Inc. All Rights Reserved.
|
||||
Copyright (C) 1999-2016 International Business Machines
|
||||
Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved.
|
||||
Copyright (C) 1998-2012, International Business Machines
|
||||
Copyright (C) 1991 and later: Unicode, Inc. and others.
|
||||
Copyright (C) 1997-2000, International Business Machines
|
||||
Copyright (c) 1999-2007, International Business Machines Corporation and
|
||||
Copyright (c) 2000 IBM, Inc. and Others.
|
||||
Copyright (C) 2008-2013, International Business Machines
|
||||
Copyright (C) 1998-2003, 2006, International Business Machines Corporation
|
||||
Copyright (c) 2002-2003,International Business Machines
|
||||
Copyright (C) 2009 International Business Machines
|
||||
Copyright (C) 2010-2016 International Business Machines
|
||||
Copyright (C) 2008-2012 IBM, Inc. All Rights Reserved.
|
||||
Copyright (C) 1998-2008, International Business Machines
|
||||
Copyright (C) 2010-2016, International Business Machines
|
||||
Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved.
|
||||
Copyright (C) 2008-2009, International Business Machines Corporation and
|
||||
Copyright (C) 2012,2014 International Business Machines
|
||||
Copyright (c) 1996-2015, International Business Machines Corporation and
|
||||
Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 1999-2012, International Business Machines Corporation and
|
||||
Copyright (C) 1996-2013, International Business Machines Corporation
|
||||
Copyright (C) 1998-2005, International Business Machines
|
||||
Copyright 2001 and onwards Google Inc.
|
||||
Copyright (C) 2010-2012,2014, International Business Machines
|
||||
Copyright (C) 1996-2015, International Business Machines Corporation and others.
|
||||
Copyright (c) 2003-2004, International Business Machines
|
||||
Copyright (C) 2000-2004, International Business Machines
|
||||
Copyright (C) 2002-2013, International Business Machines
|
||||
Copyright (C) 2002-2011 International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (C) 1999-2010, International Business Machines Corporation and others.
|
||||
Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved.
|
||||
Copyright (c) 1996-2016, International Business Machines Corporation
|
||||
Copyright (C) 1997-2010, International Business Machines

Software: opencv 4.2.0
Copyright notice:
Copyright (C) 2016, NVIDIA Corporation, all rights reserved.

@@ -0,0 +1,19 @@
set(LIB_ICU_COMMON icuuc)
set(LIB_ICU_DATA icudata)
set(LIB_ICU_I18N icui18n)
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
    message("icu4c third-party build does not support Windows currently.")
else()
    mindspore_add_pkg(icu4c
            VER 67.1
            LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
            URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
            MD5 0c2662a2b0bc80b0eb56495205247c8f
            CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-tests=no --enable-samples=no --enable-icuio=no --enable-extras=no ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
            )
    include_directories(${icu4c_INC})
    add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
    add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
    add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
    add_definitions(-D ENABLE_ICU4C)
endif()
@@ -54,6 +54,7 @@ elseif(ENABLE_D OR ENABLE_TESTCASES)
endif()

if (ENABLE_MINDDATA)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/icu4c.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/jpeg_turbo.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/libtiff.cmake)
    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/opencv.cmake)
@@ -91,7 +91,20 @@ if (ENABLE_MINDDATA)
        DESTINATION ${INSTALL_LIB_DIR}
        COMPONENT mindspore
    )

    if (CMAKE_SYSTEM_NAME MATCHES "Windows")
        message("icu4c does not support Windows temporarily.")
    else()
        file(GLOB_RECURSE ICU4C_LIB_LIST
                ${icu4c_LIBPATH}/libicuuc*
                ${icu4c_LIBPATH}/libicudata*
                ${icu4c_LIBPATH}/libicui18n*
        )
        install(
            FILES ${ICU4C_LIB_LIST}
            DESTINATION ${INSTALL_LIB_DIR}
            COMPONENT mindspore
        )
    endif()
endif ()

if (ENABLE_CPU)
@@ -108,10 +108,11 @@ target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
else()
    set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
    target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
                      mindspore::opencv_imgproc mindspore::tinyxml2)
                      mindspore::opencv_imgproc mindspore::tinyxml2 ${ICU_LIB})
if (ENABLE_GPUQUE)
    target_link_libraries(_c_dataengine PRIVATE gpu_queue
                          ${CUDNN_PATH}/lib64/libcudnn.so
@@ -65,8 +65,21 @@
#include "dataset/text/kernels/jieba_tokenizer_op.h"
#include "dataset/text/kernels/ngram_op.h"
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
#include "dataset/text/vocab.h"
#include "dataset/text/kernels/lookup_op.h"

#ifdef ENABLE_ICU4C
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include "dataset/text/kernels/bert_tokenizer_op.h"
#include "dataset/text/kernels/case_fold_op.h"
#include "dataset/text/kernels/normalize_utf8_op.h"
#include "dataset/text/kernels/regex_replace_op.h"
#include "dataset/text/kernels/regex_tokenizer_op.h"
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
#endif

#include "dataset/util/random.h"
#include "mindrecord/include/shard_operator.h"
#include "mindrecord/include/shard_pk_sample.h"
@@ -485,7 +498,7 @@ void bindTensorOps4(py::module *m) {
                 py::arg("fillR") = PadOp::kDefFillR, py::arg("fillG") = PadOp::kDefFillG, py::arg("fillB") = PadOp::kDefFillB);
}

void bindTensorOps5(py::module *m) {
void bindTokenizerOps(py::module *m) {
  (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
    .def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
         py::arg("mode") = JiebaMode::kMix)
@@ -503,6 +516,55 @@ void bindTensorOps5(py::module *m) {
                  const std::string &>(),
         py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
         py::arg("separator"));
  (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
    *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
}

void bindDependIcuTokenizerOps(py::module *m) {
#ifdef ENABLE_ICU4C
  (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
    *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
    .def(py::init<>());
  (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
    *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
    .def(py::init<>())
    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
  (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
    *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
    .def(py::init<>());
  (void)py::class_<NormalizeUTF8Op, TensorOp, std::shared_ptr<NormalizeUTF8Op>>(
    *m, "NormalizeUTF8Op", "Apply normalize operation on utf-8 string tensor.")
    .def(py::init<>())
    .def(py::init<NormalizeForm>(), py::arg("normalize_form") = NormalizeUTF8Op::kDefNormalizeForm);
  (void)py::class_<RegexReplaceOp, TensorOp, std::shared_ptr<RegexReplaceOp>>(
    *m, "RegexReplaceOp", "Replace utf-8 string tensor with 'replace' according to regular expression 'pattern'.")
    .def(py::init<const std::string &, const std::string &, bool>(), py::arg("pattern"), py::arg("replace"),
         py::arg("replace_all"));
  (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
    *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
  (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
    *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
  (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
                                                                                "Tokenizer used for Bert text process.")
    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
                  NormalizeForm, bool>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
         py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
         py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
#endif
}

void bindSamplerOps(py::module *m) {
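WordpieceTokenizerOp and BertTokenizerOp are the only ops bound above that require a Vocab. A hedged sketch of the corresponding Python-side construction; the wrapper classes and the Vocab.from_list factory are assumptions, not something this hunk shows:

import mindspore.dataset.text as text

# Hypothetical vocabulary; "##" is the suffix indicator marking subword continuation pieces.
vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]", "[CLS]", "[SEP]"])
wordpiece = text.WordpieceTokenizer(vocab=vocab, suffix_indicator="##",
                                    max_bytes_per_token=100, unknown_token="[UNK]")
# BertTokenizer chains BasicTokenizerOp and WordpieceTokenizerOp behind one op.
bert_tok = text.BertTokenizer(vocab=vocab, lower_case=True)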
@@ -715,6 +777,16 @@ PYBIND11_MODULE(_c_dataengine, m) {
    .value("DE_JIEBA_HMM", JiebaMode::kHmm)
    .export_values();

#ifdef ENABLE_ICU4C
  (void)py::enum_<NormalizeForm>(m, "NormalizeForm", py::arithmetic())
    .value("DE_NORMALIZE_NONE", NormalizeForm::kNone)
    .value("DE_NORMALIZE_NFC", NormalizeForm::kNfc)
    .value("DE_NORMALIZE_NFKC", NormalizeForm::kNfkc)
    .value("DE_NORMALIZE_NFD", NormalizeForm::kNfd)
    .value("DE_NORMALIZE_NFKD", NormalizeForm::kNfkd)
    .export_values();
#endif

  (void)py::enum_<InterpolationMode>(m, "InterpolationMode", py::arithmetic())
    .value("DE_INTER_LINEAR", InterpolationMode::kLinear)
    .value("DE_INTER_CUBIC", InterpolationMode::kCubic)
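The NormalizeForm values exported above select the Unicode normalization applied by NormalizeUTF8Op and BasicTokenizerOp. A hedged Python-side illustration, assuming the enum is re-exported under mindspore.dataset.text.utils (import path not shown in this diff):

from mindspore.dataset.text import NormalizeUTF8
from mindspore.dataset.text.utils import NormalizeForm  # assumed export location

# NFKC also folds compatibility characters, e.g. full-width "Ａ" becomes "A".
normalize = NormalizeUTF8(normalize_form=NormalizeForm.NFKC)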
@@ -734,12 +806,13 @@ PYBIND11_MODULE(_c_dataengine, m) {
  bindTensorOps2(&m);
  bindTensorOps3(&m);
  bindTensorOps4(&m);
  bindTensorOps5(&m);
  bindTokenizerOps(&m);
  bindSamplerOps(&m);
  bindDatasetOps(&m);
  bindInfoObjects(&m);
  bindVocabObjects(&m);
  bindGraphData(&m);
  bindDependIcuTokenizerOps(&m);
}
}  // namespace dataset
}  // namespace mindspore
@@ -1,8 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows"))
    set(ICU_DEPEND_FILES
            basic_tokenizer_op.cc
            bert_tokenizer_op.cc
            case_fold_op.cc
            normalize_utf8_op.cc
            regex_replace_op.cc
            regex_tokenizer_op.cc
            unicode_script_tokenizer_op.cc
            whitespace_tokenizer_op.cc)
endif()
add_library(text-kernels OBJECT
        lookup_op.cc
        jieba_tokenizer_op.cc
        unicode_char_tokenizer_op.cc
        ngram_op.cc
        wordpiece_tokenizer_op.cc
        ${ICU_DEPEND_FILES}
        )
@@ -0,0 +1,93 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
const bool BasicTokenizerOp::kDefKeepWhitespace = false;
const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
// Delimiter pattern: ASCII punctuation ranges, general Unicode punctuation, and the CJK
// ideograph blocks (Unified Ideographs, Extensions A-E, and the compatibility ideograph
// ranges), so each CJK character is split out as its own token.
const char BasicTokenizerOp::kCommonPattern[] =
  "[!-/]"
  "|[:-@]"
  "|[\\[-`]"
  "|[{-~]"
  "|[\\p{P}]"
  "|[\\x{4E00}-\\x{9FFF}]"
  "|[\\x{3400}-\\x{4DBF}]"
  "|[\\x{20000}-\\x{2A6DF}]"
  "|[\\x{2A700}-\\x{2B73F}]"
  "|[\\x{2B740}-\\x{2B81F}]"
  "|[\\x{2B820}-\\x{2CEAF}]"
  "|[\\x{F900}-\\x{FAFF}]"
  "|[\\x{2F800}-\\x{2FA1F}]";
// Special BERT tokens that can be preserved as whole tokens instead of being split.
const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";

BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                   bool preserve_unused_token)
    : lower_case_(lower_case),
      keep_whitespace_(keep_whitespace),
      preserve_unused_token_(preserve_unused_token),
      case_fold_(std::make_unique<CaseFoldOp>()),
      nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
      common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
      replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
      replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
  std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
  std::string keep_delim_pattern;
  if (keep_whitespace_) {
    keep_delim_pattern = delim_pattern;
  } else {
    keep_delim_pattern = kCommonPattern;
  }
  if (preserve_unused_token_) {
    keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
    delim_pattern = kUnusedPattern + delim_pattern;
  }
  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
}

Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be a scalar string tensor");
  }
  std::shared_ptr<Tensor> cur_input;
  std::shared_ptr<Tensor> processed_tensor;
  if (lower_case_) {
    // to lower case
    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
    cur_input = processed_tensor;
    // strip accent characters: NFD-decompose, then remove combining marks (\p{Mn})
    RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
    cur_input = processed_tensor;
    RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
  } else {
    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
  }
  // strip control characters
  cur_input = processed_tensor;
  RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
  return regex_tokenizer_->Compute(processed_tensor, output);
}
}  // namespace dataset
}  // namespace mindspore
@ -0,0 +1,64 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/text/kernels/case_fold_op.h"
|
||||
#include "dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include "dataset/text/kernels/regex_replace_op.h"
|
||||
#include "dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class BasicTokenizerOp : public TensorOp {
|
||||
public:
|
||||
static const bool kDefLowerCase;
|
||||
static const bool kDefKeepWhitespace;
|
||||
static const NormalizeForm kDefNormalizationForm;
|
||||
static const bool kDefPreserveUnusedToken;
|
||||
BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = kDefNormalizationForm,
|
||||
bool preserve_unused_token = kDefPreserveUnusedToken);
|
||||
|
||||
~BasicTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
static const char kCommonPattern[];
|
||||
static const char kUnusedPattern[];
|
||||
bool lower_case_;
|
||||
bool keep_whitespace_;
|
||||
NormalizeForm normalization_form_;
|
||||
bool preserve_unused_token_;
|
||||
std::unique_ptr<CaseFoldOp> case_fold_;
|
||||
std::unique_ptr<NormalizeUTF8Op> nfd_normalize_;
|
||||
std::unique_ptr<NormalizeUTF8Op> common_normalize_;
|
||||
std::unique_ptr<RegexReplaceOp> replace_accent_chars_;
|
||||
std::unique_ptr<RegexReplaceOp> replace_control_chars_;
|
||||
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/bert_tokenizer_op.h"
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
std::shared_ptr<Tensor> basic_tensor;
|
||||
RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
|
||||
RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/text/kernels/basic_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class BertTokenizerOp : public TensorOp {
|
||||
public:
|
||||
BertTokenizerOp(const std::shared_ptr<Vocab> &vocab,
|
||||
const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
|
||||
const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
|
||||
const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
|
||||
bool lower_case = BasicTokenizerOp::kDefLowerCase,
|
||||
bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
|
||||
NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
|
||||
bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
|
||||
: wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
|
||||
basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
|
||||
|
||||
~BertTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
WordpieceTokenizerOp wordpiece_tokenizer_;
|
||||
BasicTokenizerOp basic_tokenizer_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_BERT_TOKENIZER_OP_H_
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/case_fold_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
icu::ErrorCode error;
|
||||
const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed.");
|
||||
std::vector<std::string> strs(input->Size());
|
||||
int i = 0;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
icu::StringByteSink<std::string> sink(&strs[i++]);
|
||||
nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
|
||||
}
|
||||
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class CaseFoldOp : public TensorOp {
|
||||
public:
|
||||
CaseFoldOp() {}
|
||||
|
||||
~CaseFoldOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "CaseFoldOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_CASE_FOLD_OP_H_
|
|
@ -29,6 +29,7 @@ JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::strin
|
|||
}
|
||||
|
||||
Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
|
||||
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
|
|
|
@ -24,6 +24,7 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
|
|||
: vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
|
||||
|
||||
Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
RETURN_UNEXPECTED_IF_NULL(vocab_);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
|
||||
std::vector<WordIdType> word_ids;
|
||||
|
|
|
@ -34,6 +34,7 @@ NgramOp::NgramOp(const std::vector<int32_t> &ngrams, int32_t l_len, int32_t r_le
|
|||
separator_(separator) {}
|
||||
|
||||
Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING && input->Rank() == 1, "Not a 1-D str Tensor");
|
||||
std::vector<int32_t> offsets; // offsets for each str
|
||||
std::vector<std::string> res; // holds the result of ngrams
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
|
||||
Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
icu::ErrorCode error;
|
||||
const icu::Normalizer2 *normalize = nullptr;
|
||||
switch (normalize_form_) {
|
||||
case NormalizeForm::kNone: {
|
||||
*output = input;
|
||||
return Status::OK();
|
||||
}
|
||||
case NormalizeForm::kNfc: {
|
||||
normalize = icu::Normalizer2::getNFCInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFCInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfkc: {
|
||||
normalize = icu::Normalizer2::getNFKCInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfd: {
|
||||
normalize = icu::Normalizer2::getNFDInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFDInstance failed");
|
||||
break;
|
||||
}
|
||||
case NormalizeForm::kNfkd: {
|
||||
normalize = icu::Normalizer2::getNFKDInstance(error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKDInstance failed");
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
RETURN_STATUS_UNEXPECTED("unexpected normalize form");
|
||||
break;
|
||||
}
|
||||
}
|
||||
std::vector<std::string> strs(input->Size());
|
||||
int i = 0;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
icu::StringByteSink<std::string> sink(&strs[i++]);
|
||||
normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "normalizeUTF8 failed.");
|
||||
}
|
||||
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
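For intuition only: Python's standard unicodedata module exposes the same four named normalization forms that NormalizeUTF8Op selects from ICU. The snippet below is a rough analogue of the op's behavior, not the ICU code path used here.

import unicodedata

# NFKC folds compatibility characters, e.g. fullwidth Latin letters become ASCII
print(unicodedata.normalize("NFKC", "ｆｕｌｌｗｉｄｔｈ"))  # -> "fullwidth"
# NFD decomposes precomposed characters into base character plus combining marks
print(unicodedata.normalize("NFD", "é"))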
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
enum class NormalizeForm {
|
||||
kNone = 0,
|
||||
kNfc,
|
||||
kNfkc,
|
||||
kNfd,
|
||||
kNfkd,
|
||||
};
|
||||
|
||||
class NormalizeUTF8Op : public TensorOp {
|
||||
public:
|
||||
static const NormalizeForm kDefNormalizeForm;
|
||||
explicit NormalizeUTF8Op(NormalizeForm normalize_form = kDefNormalizeForm) : normalize_form_(normalize_form) {}
|
||||
|
||||
~NormalizeUTF8Op() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "NormalizeUTF8Op"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
NormalizeForm normalize_form_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_NORMALIZE_UTF8_OP_H_
|
|
@ -0,0 +1,57 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/regex_replace_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
Status RegexReplaceOp::RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text,
|
||||
std::string *out) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED((matcher != nullptr && out != nullptr), "Input is null");
|
||||
UErrorCode icu_error = U_ZERO_ERROR;
|
||||
icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
|
||||
matcher->reset(unicode_text);
|
||||
icu::UnicodeString unicode_out;
|
||||
if (replace_all_) {
|
||||
unicode_out = matcher->replaceAll(replace_, icu_error);
|
||||
} else {
|
||||
unicode_out = matcher->replaceFirst(replace_, icu_error);
|
||||
}
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "RegexReplace failed");
|
||||
unicode_out.toUTF8String(*out);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status RegexReplaceOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
UErrorCode icu_error = U_ZERO_ERROR;
|
||||
icu::RegexMatcher matcher(pattern_, 0, icu_error);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "Failed to create ICU RegexMatcher; the pattern may be invalid");
|
||||
std::vector<std::string> strs(input->Size());
|
||||
int i = 0;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
RETURN_IF_NOT_OK(RegexReplace(&matcher, *iter, &strs[i]));
|
||||
}
|
||||
*output = std::make_shared<Tensor>(std::move(strs), input->shape());
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class RegexReplaceOp : public TensorOp {
|
||||
public:
|
||||
RegexReplaceOp(const std::string &pattern, const std::string &replace, bool replace_all = true)
|
||||
: pattern_(icu::UnicodeString::fromUTF8(pattern)),
|
||||
replace_(icu::UnicodeString::fromUTF8(replace)),
|
||||
replace_all_(replace_all) {}
|
||||
|
||||
~RegexReplaceOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "RegexReplaceOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
protected:
|
||||
Status RegexReplace(icu::RegexMatcher *const matcher, const std::string_view &text, std::string *out) const;
|
||||
|
||||
private:
|
||||
const icu::UnicodeString pattern_;
|
||||
const icu::UnicodeString replace_;
|
||||
const bool replace_all_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_REGEX_REPLACE_OP_H_
|
|
@ -0,0 +1,103 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
|
||||
icu::UnicodeString *out_unicode) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
|
||||
int total_len = input.length();
|
||||
int end = start + len;
|
||||
CHECK_FAIL_RETURN_UNEXPECTED((start >= 0 && len > 0 && end <= total_len), "Out of range");
|
||||
icu::UnicodeString temp;
|
||||
input.extract(start, len, temp);
|
||||
if (out_utf8 != nullptr) {
|
||||
temp.toUTF8String(*out_utf8);
|
||||
}
|
||||
if (out_unicode != nullptr) {
|
||||
*out_unicode = temp;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
out_tokens->clear();
|
||||
icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Failed to create ICU RegexMatcher; delim_pattern may be invalid");
|
||||
icu::RegexMatcher delim_matcher(keep_delim_pattern_, 0, status);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Failed to create ICU RegexMatcher; keep_delim_pattern may be invalid");
|
||||
|
||||
icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
|
||||
token_matcher.reset(utext);
|
||||
|
||||
int token_start_index = 0;
|
||||
status = U_ZERO_ERROR;
|
||||
while (token_matcher.find(status) && U_SUCCESS(status)) {
|
||||
int deli_start_index = token_matcher.start(status);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched start index failed");
|
||||
int deli_end_index = token_matcher.end(status);
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(status), "Get RegexMatcher matched end index failed");
|
||||
|
||||
// Add non-empty token
|
||||
int token_len = deli_start_index - token_start_index;
|
||||
if (token_len > 0) {
|
||||
std::string token;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
|
||||
out_tokens->emplace_back(std::move(token));
|
||||
}
|
||||
|
||||
int delim_len = deli_end_index - deli_start_index;
|
||||
if (keep_delim_ && delim_len > 0) {
|
||||
icu::UnicodeString delim_str;
|
||||
std::string delim_utf8_str;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
|
||||
delim_matcher.reset(delim_str);
|
||||
if (delim_matcher.matches(status) && U_SUCCESS(status)) {
|
||||
out_tokens->emplace_back(std::move(delim_utf8_str));
|
||||
}
|
||||
}
|
||||
token_start_index = deli_end_index;
|
||||
}
|
||||
|
||||
if (token_start_index < utext.length()) {
|
||||
std::string temp;
|
||||
RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
|
||||
out_tokens->emplace_back(std::move(temp));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view text;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
|
||||
std::vector<std::string> tokens;
|
||||
RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
|
||||
*output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
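To make the splitting rule in GetRegexTokens easier to follow, here is a hypothetical Python sketch of the same logic using the standard re module (the real op uses ICU regex, whose syntax differs slightly): text between delimiter matches becomes a token, and a matched delimiter is also emitted when it fully matches keep_delim_pattern.

import re

def get_regex_tokens(text, delim_pattern, keep_delim_pattern=""):
    tokens, start = [], 0
    keep = re.compile(keep_delim_pattern) if keep_delim_pattern else None
    for m in re.finditer(delim_pattern, text):
        if m.start() > start:                      # non-empty token before the delimiter
            tokens.append(text[start:m.start()])
        delim = m.group(0)
        if keep is not None and delim and keep.fullmatch(delim):
            tokens.append(delim)                   # keep the delimiter itself as a token
        start = m.end()
    if start < len(text):                          # trailing token after the last delimiter
        tokens.append(text[start:])
    return tokens

print(get_regex_tokens("Welcome  to  China", r"\s+"))          # ['Welcome', 'to', 'China']
print(get_regex_tokens("Welcome  to  China", r"\s+", r"\s+"))  # whitespace runs kept as tokens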
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class RegexTokenizerOp : public TensorOp {
|
||||
public:
|
||||
RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
|
||||
: delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
|
||||
keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
|
||||
keep_delim_(!keep_delim_pattern.empty()) {}
|
||||
|
||||
~RegexTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
protected:
|
||||
Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
|
||||
icu::UnicodeString *out_unicode = nullptr) const;
|
||||
Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
|
||||
|
||||
private:
|
||||
const icu::UnicodeString delim_pattern_;
|
||||
const icu::UnicodeString keep_delim_pattern_;
|
||||
const bool keep_delim_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_REGEX_TOKENIZER_OP_H_
|
|
@ -28,6 +28,7 @@ namespace mindspore {
|
|||
namespace dataset {
|
||||
|
||||
Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
|
|
|
@ -13,8 +13,8 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
#define DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
#ifndef DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
|
@ -37,4 +37,4 @@ class UnicodeCharTokenizerOp : public TensorOp {
|
|||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_KERNELS_TEXT_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
#endif // DATASET_TEXT_KERNELS_UNICODE_CHAR_TOKENIZER_OP_H_
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
using cppjieba::DecodeRunesInString;
|
||||
using cppjieba::RuneStrArray;
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
|
||||
|
||||
Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view str;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
|
||||
UScriptCode last_script = USCRIPT_INVALID_CODE;
|
||||
icu::ErrorCode status;
|
||||
int start = 0;
|
||||
int len = 0;
|
||||
std::vector<std::string> splits;
|
||||
|
||||
bool was_space = false;
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
bool is_space = u_isUWhiteSpace(runes[i].rune);
|
||||
UScriptCode script = uscript_getScript(runes[i].rune, status);
|
||||
if (status.isFailure()) {
|
||||
status.reset();
|
||||
script = USCRIPT_INVALID_CODE;
|
||||
}
|
||||
// 1) Separate UTF-8 strings of different UScriptCode values
|
||||
// (such as: "Chinese中国" should be split into ["Chinese", "中国"])
|
||||
// 2) Separate whitespace and non-whitespace UTF-8 strings
|
||||
// (such as: " ." should be split to [" ", "."])
|
||||
if (len > 0 && (script != last_script || is_space != was_space)) {
|
||||
// 3) If keep_whitespace_ is false, all whitespace characters will be discarded
|
||||
if (keep_whitespace_ || !was_space) {
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
start = runes[i].offset;
|
||||
len = runes[i].len;
|
||||
} else {
|
||||
len += runes[i].len;
|
||||
}
|
||||
last_script = script;
|
||||
was_space = is_space;
|
||||
}
|
||||
|
||||
if (len > 0 && (keep_whitespace_ || !was_space)) {
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
// 4) If the input is empty scalar string, the output will be 1-D empty string.
|
||||
if (splits.empty()) {
|
||||
splits.emplace_back("");
|
||||
}
|
||||
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,44 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class UnicodeScriptTokenizerOp : public TensorOp {
|
||||
public:
|
||||
static const bool kDefKeepWhitespace;
|
||||
|
||||
explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
|
||||
|
||||
~UnicodeScriptTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
private:
|
||||
bool keep_whitespace_;  // Whether or not to keep whitespace tokens
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_UNICODE_SCRIPT_TOKENIZER_OP_H_
|
|
@ -0,0 +1,73 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
using cppjieba::DecodeRunesInString;
|
||||
using cppjieba::RuneStrArray;
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
|
||||
}
|
||||
std::string_view str;
|
||||
RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
|
||||
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(str.data(), str.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
std::vector<std::string> splits;
|
||||
int start = 0;
|
||||
int len = 0;
|
||||
for (size_t i = 0; i < runes.size(); i++) {
|
||||
if (u_isUWhiteSpace(runes[i].rune)) {
|
||||
if (len > 0) {
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
len = 0;
|
||||
}
|
||||
} else {
|
||||
if (len == 0) {
|
||||
start = runes[i].offset;
|
||||
}
|
||||
len += runes[i].len;
|
||||
}
|
||||
}
|
||||
if (len > 0) {
|
||||
std::string temp(str.substr(start, len));
|
||||
splits.emplace_back(std::move(temp));
|
||||
}
|
||||
if (splits.empty()) {
|
||||
splits.emplace_back("");
|
||||
}
|
||||
*output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class WhitespaceTokenizerOp : public TensorOp {
|
||||
public:
|
||||
WhitespaceTokenizerOp() {}
|
||||
|
||||
~WhitespaceTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_WHITESPACE_TOKENIZER_OP_H_
|
|
@ -0,0 +1,138 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dataset/text/kernels/wordpiece_tokenizer_op.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
|
||||
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
|
||||
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
|
||||
|
||||
WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
|
||||
const int &max_bytes_per_token, const std::string &unknown_token)
|
||||
: vocab_(vocab),
|
||||
suffix_indicator_(suffix_indicator),
|
||||
max_bytes_per_token_(max_bytes_per_token),
|
||||
unknown_token_(unknown_token) {}
|
||||
|
||||
void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
|
||||
std::vector<std::string> *out_padded_tokens, int *out_cols) const {
|
||||
int rows = tokens.size();
|
||||
int max_cols = 0;
|
||||
for (int i = 0; i < rows; i++) {
|
||||
max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
|
||||
}
|
||||
out_padded_tokens->resize(rows * max_cols, padded_str);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
int index = i * max_cols;
|
||||
for (int j = 0; j < tokens[i].size(); j++) {
|
||||
(*out_padded_tokens)[index++] = tokens[i][j];
|
||||
}
|
||||
}
|
||||
*out_cols = max_cols;
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
|
||||
bool *out_found, int *out_end) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
|
||||
*out_found = false;
|
||||
for (int i = runes.size() - 1; i >= 0; i--) {
|
||||
*out_end = runes[i].offset + runes[i].len;
|
||||
int len = *out_end - start;
|
||||
std::string word = input_token.substr(start, len);
|
||||
if (start > 0) {
|
||||
word = suffix_indicator_ + word;
|
||||
}
|
||||
WordIdType default_id = -1;
|
||||
if (vocab_->Lookup(word, default_id) != default_id) {
|
||||
*out_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
|
||||
out_tokens->clear();
|
||||
if (unknown_token_.empty()) {
|
||||
out_tokens->emplace_back(input_token);
|
||||
} else {
|
||||
out_tokens->emplace_back(unknown_token_);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
|
||||
std::vector<std::string> *out_tokens) const {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
|
||||
std::string subword = input_token.substr(start, end - start);
|
||||
if (start > 0) {
|
||||
subword = suffix_indicator_ + subword;
|
||||
}
|
||||
out_tokens->emplace_back(subword);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
|
||||
if (input_token.size() > max_bytes_per_token_) {
|
||||
return FoundNoToken(input_token, out_tokens);
|
||||
}
|
||||
RuneStrArray runes;
|
||||
if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
|
||||
RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
|
||||
}
|
||||
int end;
|
||||
for (int start = 0; start < input_token.size();) {
|
||||
bool found;
|
||||
RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
|
||||
if (found) {
|
||||
RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
|
||||
start = end;
|
||||
} else {
|
||||
return FoundNoToken(input_token, out_tokens);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
|
||||
RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
|
||||
}
|
||||
std::vector<std::vector<std::string>> out_tokens(input->Size());
|
||||
int i = 0;
|
||||
for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
|
||||
RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
|
||||
}
|
||||
std::vector<std::string> padded_tokens;
|
||||
int cols = 0;
|
||||
PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
|
||||
std::vector<dsize_t> shapes;
|
||||
if (input->Rank() == 1) {
|
||||
shapes.push_back(out_tokens.size());
|
||||
}
|
||||
shapes.push_back(cols);
|
||||
*output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
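The greedy longest-match-first loop in LookupWord/GetTokens can be summarized by this hypothetical, character-level Python sketch (the real op works on UTF-8 rune boundaries and a Vocab object rather than a plain set):

def wordpiece(token, vocab, suffix_indicator="##", unknown_token="[UNK]"):
    out, start = [], 0
    while start < len(token):
        end = len(token)
        while end > start:                      # try the longest remaining slice first
            piece = token[start:end]
            if start > 0:
                piece = suffix_indicator + piece
            if piece in vocab:
                out.append(piece)
                break
            end -= 1
        if end == start:                        # no subword matched at this position
            return [token] if unknown_token == "" else [unknown_token]
        start = end
    return out

print(wordpiece("unaffable", {"un", "##aff", "##able"}))  # ['un', '##aff', '##able']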
|
|
@ -0,0 +1,68 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
|
||||
#define DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
|
||||
#include "dataset/core/tensor.h"
|
||||
#include "dataset/kernels/tensor_op.h"
|
||||
#include "dataset/text/vocab.h"
|
||||
#include "dataset/util/status.h"
|
||||
|
||||
using cppjieba::DecodeRunesInString;
|
||||
using cppjieba::RuneStrArray;
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class WordpieceTokenizerOp : public TensorOp {
|
||||
public:
|
||||
static const char kDefSuffixIndicator[];
|
||||
static const int kDefMaxBytesPerToken;
|
||||
static const char kDefUnknownToken[];
|
||||
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
|
||||
const int &max_bytes_per_token = kDefMaxBytesPerToken,
|
||||
const std::string &unknown_token = kDefUnknownToken);
|
||||
|
||||
~WordpieceTokenizerOp() override = default;
|
||||
|
||||
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
protected:
|
||||
void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
|
||||
std::vector<std::string> *out_padded_tokens, int *out_cols) const;
|
||||
Status AddSubword(const std::string &input_token, const int start, const int end,
|
||||
std::vector<std::string> *out_token) const;
|
||||
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
|
||||
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
|
||||
int *out_end) const;
|
||||
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const;
|
||||
|
||||
private:
|
||||
const std::shared_ptr<Vocab> vocab_;
|
||||
const std::string suffix_indicator_;
|
||||
const int max_bytes_per_token_;
|
||||
const std::string unknown_token_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // DATASET_TEXT_KERNELS_WORDPIECE_TOKENIZER_OP_H_
|
|
@ -15,5 +15,18 @@
|
|||
"""
|
||||
mindspore.dataset.text
|
||||
"""
|
||||
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram
|
||||
from .utils import to_str, to_bytes, JiebaMode, Vocab
|
||||
import platform
|
||||
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer
|
||||
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm
|
||||
|
||||
__all__ = [
|
||||
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
|
||||
"to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer"
|
||||
]
|
||||
|
||||
if platform.system().lower() != 'windows':
|
||||
from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \
|
||||
RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer
|
||||
|
||||
__all__.extend(["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8",
|
||||
"RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"])
|
||||
|
|
|
@ -17,10 +17,11 @@ c transforms for all text related operators
|
|||
|
||||
import os
|
||||
import re
|
||||
import platform
|
||||
|
||||
import mindspore._c_dataengine as cde
|
||||
|
||||
from .utils import JiebaMode
|
||||
from .utils import JiebaMode, NormalizeForm
|
||||
from .validators import check_lookup, check_jieba_add_dict, \
|
||||
check_jieba_add_word, check_jieba_init, check_ngram
|
||||
|
||||
|
@ -174,3 +175,172 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
|
|||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string to Unicode characters.
|
||||
"""
|
||||
|
||||
|
||||
class WordpieceTokenizer(cde.WordpieceTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar token or 1-D tokens into subword tokens.
|
||||
|
||||
Args:
|
||||
vocab(Vocab): a Vocab object.
|
||||
suffix_indicator(string, optional): Used to show that the subword is the last part of a word(default '##').
|
||||
max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split(default 100).
|
||||
unknown_token(string, optional): If a token is not found in the vocab: if 'unknown_token' is an empty string,
|
||||
return the token directly; otherwise return 'unknown_token' (default '[UNK]').
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'):
|
||||
self.vocab = vocab
|
||||
self.suffix_indicator = suffix_indicator
|
||||
self.max_bytes_per_token = max_bytes_per_token
|
||||
self.unknown_token = unknown_token
|
||||
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token)
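A minimal, hedged usage sketch ("corpus.txt" is a hypothetical input file, Vocab.from_list is assumed to be available in this version, and WhitespaceTokenizer is non-Windows only):

import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(["happ", "##y", "[UNK]"])  # from_list assumed available here
data = ds.TextFileDataset("corpus.txt")                  # hypothetical input file
data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())  # non-Windows only
data = data.map(input_columns=["text"], operations=text.WordpieceTokenizer(vocab, unknown_token='[UNK]'))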
|
||||
|
||||
|
||||
if platform.system().lower() != 'windows':
|
||||
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string on ICU-defined whitespace characters (such as ' ', '\t', '\r', '\n').
|
||||
"""
|
||||
|
||||
|
||||
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
|
||||
|
||||
Args:
|
||||
keep_whitespace(bool, optional): Whether or not to emit whitespace tokens (default False).
|
||||
"""
|
||||
|
||||
def __init__(self, keep_whitespace=False):
|
||||
self.keep_whitespace = keep_whitespace
|
||||
super().__init__(self.keep_whitespace)
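A hedged usage sketch for the two ICU-based splitters above (non-Windows only; the file name is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")  # hypothetical input file
# split on Unicode script boundaries, dropping whitespace tokens
data = data.map(input_columns=["text"], operations=text.UnicodeScriptTokenizer(keep_whitespace=False))
# alternatively, split purely on whitespace:
# data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())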
|
||||
|
||||
|
||||
class CaseFold(cde.CaseFoldOp):
|
||||
"""
|
||||
Apply case fold operation on utf-8 string tensor.
|
||||
"""
|
||||
|
||||
|
||||
DE_C_INTER_NORMALIZE_FORM = {
|
||||
NormalizeForm.NONE: cde.NormalizeForm.DE_NORMALIZE_NONE,
|
||||
NormalizeForm.NFC: cde.NormalizeForm.DE_NORMALIZE_NFC,
|
||||
NormalizeForm.NFKC: cde.NormalizeForm.DE_NORMALIZE_NFKC,
|
||||
NormalizeForm.NFD: cde.NormalizeForm.DE_NORMALIZE_NFD,
|
||||
NormalizeForm.NFKD: cde.NormalizeForm.DE_NORMALIZE_NFKD
|
||||
}
|
||||
|
||||
|
||||
class NormalizeUTF8(cde.NormalizeUTF8Op):
|
||||
"""
|
||||
Apply normalize operation on utf-8 string tensor.
|
||||
|
||||
Args:
|
||||
normalize_form(Enum, optional): Valid values are "NONE", "NFC", "NFKC", "NFD", "NFKD".
|
||||
If set to "NONE", no normalization is applied to the input string tensor.
|
||||
If set to any of "NFC", "NFKC", "NFD", "NFKD", the corresponding normalization is applied (default "NFKC").
|
||||
See http://unicode.org/reports/tr15/ for details.
|
||||
"""
|
||||
|
||||
def __init__(self, normalize_form=NormalizeForm.NFKC):
|
||||
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
|
||||
super().__init__(self.normalize_form)
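A hedged sketch chaining CaseFold and NormalizeUTF8 before tokenization (non-Windows only; the file name is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import NormalizeForm

data = ds.TextFileDataset("corpus.txt")
data = data.map(input_columns=["text"], operations=text.CaseFold())
data = data.map(input_columns=["text"], operations=text.NormalizeUTF8(normalize_form=NormalizeForm.NFKC))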
|
||||
|
||||
|
||||
class RegexReplace(cde.RegexReplaceOp):
|
||||
"""
|
||||
Replace the parts of a UTF-8 string tensor that match the regular expression 'pattern' with 'replace'.
|
||||
See http://userguide.icu-project.org/strings/regexp for the supported regex syntax.
|
||||
|
||||
Args:
|
||||
pattern(string): the regex pattern to match.
|
||||
replace(string): the string used to replace matched elements.
|
||||
replace_all(bool, optional): If False, only replace first matched element;
|
||||
if True, replace all matched elements(default True).
|
||||
"""
|
||||
|
||||
def __init__(self, pattern, replace, replace_all=True):
|
||||
self.pattern = pattern
|
||||
self.replace = replace
|
||||
self.replace_all = replace_all
|
||||
super().__init__(self.pattern, self.replace, self.replace_all)
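A hedged sketch: collapse runs of whitespace into a single space (non-Windows only; the file name is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")
data = data.map(input_columns=["text"], operations=text.RegexReplace(pattern=r"\s+", replace=" "))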
|
||||
|
||||
|
||||
class RegexTokenizer(cde.RegexTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string using a regex pattern.
|
||||
See http://userguide.icu-project.org/strings/regexp for the supported regex syntax.
|
||||
|
||||
Args:
|
||||
delim_pattern(string): The pattern of regex delimiters.
|
||||
The original string will be split by matched elements.
|
||||
keep_delim_pattern(string, optional): The string matched by 'delim_pattern' can be kept as a token
|
||||
if it can be matched by 'keep_delim_pattern'. The default value is an empty string (''),
|
||||
in which case delimiters will not be kept as output tokens.
|
||||
"""
|
||||
|
||||
def __init__(self, delim_pattern, keep_delim_pattern=''):
|
||||
self.delim_pattern = delim_pattern
|
||||
self.keep_delim_pattern = keep_delim_pattern
|
||||
super().__init__(self.delim_pattern, self.keep_delim_pattern)
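A hedged sketch: split on runs of whitespace while also keeping them as tokens (non-Windows only; the file name is hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")
tokenizer = text.RegexTokenizer(delim_pattern=r"\s+", keep_delim_pattern=r"\s+")
data = data.map(input_columns=["text"], operations=tokenizer)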
|
||||
|
||||
|
||||
class BasicTokenizer(cde.BasicTokenizerOp):
|
||||
"""
|
||||
Tokenize a scalar tensor of UTF-8 string by specific rules.
|
||||
|
||||
Args:
|
||||
lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
|
||||
on the input text to convert it to lower case and strip accent characters; if False, only apply
|
||||
the NormalizeUTF8 ('normalization_form' mode) operation on the input text (default False).
|
||||
keep_whitespace(bool, optional): If True, whitespace will be kept in the output tokens (default False).
|
||||
normalization_form(Enum, optional): Used to specify the normalization mode,
|
||||
only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
|
||||
preserve_unused_token(bool, optional): If True, do not split special tokens like
|
||||
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default True).
|
||||
"""
|
||||
|
||||
def __init__(self, lower_case=False, keep_whitespace=False,
|
||||
normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
|
||||
self.lower_case = lower_case
|
||||
self.keep_whitespace = keep_whitespace
|
||||
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
|
||||
self.preserve_unused_token = preserve_unused_token
|
||||
super().__init__(self.lower_case, self.keep_whitespace,
|
||||
self.normalization_form, self.preserve_unused_token)
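A hedged usage sketch (non-Windows only; the file name is hypothetical, and with lower_case=True the normalization_form argument is ignored, as described above):

import mindspore.dataset as ds
import mindspore.dataset.text as text

data = ds.TextFileDataset("corpus.txt")
tokenizer = text.BasicTokenizer(lower_case=True, keep_whitespace=False, preserve_unused_token=True)
data = data.map(input_columns=["text"], operations=tokenizer)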
|
||||
|
||||
|
||||
class BertTokenizer(cde.BertTokenizerOp):
    """
    Tokenizer used for Bert text processing.

    Args:
        vocab(Vocab): a Vocab object.
        suffix_indicator(string, optional): Prefix used to indicate that a subword is a continuation of a word
            rather than its first piece (default '##').
        max_bytes_per_token(int, optional): Tokens exceeding this length will not be further split (default 100).
        unknown_token(string, optional): Used when a token is not found in the vocab: if 'unknown_token' is an
            empty string, return the token directly, otherwise return 'unknown_token' (default '[UNK]').
        lower_case(bool, optional): If True, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations
            to the input text to lower-case it and strip accent characters; if False, only apply
            the NormalizeUTF8 operation in 'normalization_form' mode to the input text (default False).
        keep_whitespace(bool, optional): If True, whitespace is kept in the output tokens (default False).
        normalization_form(Enum, optional): The normalization mode to use,
            only effective when 'lower_case' is False. See NormalizeUTF8 for details (default 'NONE').
        preserve_unused_token(bool, optional): If True, do not split special tokens such as
            '[CLS]', '[SEP]', '[UNK]', '[PAD]' and '[MASK]' (default True).
    """

    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
                 unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                 normalization_form=NormalizeForm.NONE, preserve_unused_token=True):
        self.vocab = vocab
        self.suffix_indicator = suffix_indicator
        self.max_bytes_per_token = max_bytes_per_token
        self.unknown_token = unknown_token
        self.lower_case = lower_case
        self.keep_whitespace = keep_whitespace
        self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
        self.preserve_unused_token = preserve_unused_token
        super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
                         self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token)

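A short usage sketch for BertTokenizer (illustration only, not part of this diff; the path and the toy vocab are hypothetical):

import mindspore.dataset as ds
import mindspore.dataset.text as nlp

vocab = nlp.Vocab.from_list(["床", "前", "明", "月", "光", "[UNK]", "[CLS]", "[SEP]"])  # toy vocab
dataset = ds.TextFileDataset("bert_tokenizer.txt", shuffle=False)  # hypothetical path
tokenizer = nlp.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
                              unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                              normalization_form=nlp.utils.NormalizeForm.NONE,
                              preserve_unused_token=True)
dataset = dataset.map(operations=tokenizer)
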
@ -127,3 +127,11 @@ class JiebaMode(IntEnum):
    MIX = 0
    MP = 1
    HMM = 2


class NormalizeForm(IntEnum):
    NONE = 0
    NFC = 1
    NFKC = 2
    NFD = 3
    NFKD = 4

|
@ -18,7 +18,14 @@
|
|||
#include <string_view>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "dataset/text/kernels/basic_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/case_fold_op.h"
|
||||
#include "dataset/text/kernels/normalize_utf8_op.h"
|
||||
#include "dataset/text/kernels/regex_replace_op.h"
|
||||
#include "dataset/text/kernels/regex_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/unicode_char_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/unicode_script_tokenizer_op.h"
|
||||
#include "dataset/text/kernels/whitespace_tokenizer_op.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
|
@ -105,3 +112,229 @@ TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
|
|||
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
|
||||
MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
|
||||
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp());
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 3);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "China.");
|
||||
|
||||
input = std::make_shared<Tensor>(" hello");
|
||||
s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>("hello");
|
||||
s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>("hello ");
|
||||
s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
|
||||
CheckEqual(output, {0}, "hello");
|
||||
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
|
||||
MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true));
|
||||
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false));
|
||||
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 10);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, " ");
|
||||
CheckEqual(output, {2}, "to");
|
||||
CheckEqual(output, {3}, " ");
|
||||
CheckEqual(output, {4}, "China");
|
||||
CheckEqual(output, {5}, ".");
|
||||
CheckEqual(output, {6}, " \n ");
|
||||
CheckEqual(output, {7}, "中国");
|
||||
CheckEqual(output, {8}, "\t");
|
||||
CheckEqual(output, {9}, "北京");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 6);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor2: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "China");
|
||||
CheckEqual(output, {3}, ".");
|
||||
CheckEqual(output, {4}, "中国");
|
||||
CheckEqual(output, {5}, "北京");
|
||||
|
||||
input = std::make_shared<Tensor>(" Welcome to 中国. ");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 4);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor3: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Welcome");
|
||||
CheckEqual(output, {1}, "to");
|
||||
CheckEqual(output, {2}, "中国");
|
||||
CheckEqual(output, {3}, ".");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 8);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor4: " << output->ToString();
|
||||
CheckEqual(output, {0}, " ");
|
||||
CheckEqual(output, {1}, "Welcome");
|
||||
CheckEqual(output, {2}, " ");
|
||||
CheckEqual(output, {3}, "to");
|
||||
CheckEqual(output, {4}, " ");
|
||||
CheckEqual(output, {5}, "中国");
|
||||
CheckEqual(output, {6}, ".");
|
||||
CheckEqual(output, {7}, " ");
|
||||
|
||||
input = std::make_shared<Tensor>("Hello");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor5: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Hello");
|
||||
|
||||
input = std::make_shared<Tensor>("H");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor6: " << output->ToString();
|
||||
CheckEqual(output, {0}, "H");
|
||||
|
||||
input = std::make_shared<Tensor>("");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor7: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
|
||||
input = std::make_shared<Tensor>("Hello中国Hello世界");
|
||||
s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 4);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor8: " << output->ToString();
|
||||
CheckEqual(output, {0}, "Hello");
|
||||
CheckEqual(output, {1}, "中国");
|
||||
CheckEqual(output, {2}, "Hello");
|
||||
CheckEqual(output, {3}, "世界");
|
||||
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = keep_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor10: " << output->ToString();
|
||||
CheckEqual(output, {0}, " ");
|
||||
input = std::make_shared<Tensor>(" ");
|
||||
s = skip_whitespace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 1);
|
||||
MS_LOG(INFO) << "Out tensor11: " << output->ToString();
|
||||
CheckEqual(output, {0}, "");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
|
||||
MS_LOG(INFO) << "Doing TestCaseFold.";
|
||||
std::unique_ptr<CaseFoldOp> case_fold_op(new CaseFoldOp());
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = case_fold_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 0);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {}, "welcome to china. \n 中国\t北京");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestNormalize) {
|
||||
MS_LOG(INFO) << "Doing TestNormalize.";
|
||||
std::unique_ptr<NormalizeUTF8Op> nfc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfc));
|
||||
std::unique_ptr<NormalizeUTF8Op> nfkc_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkc));
|
||||
std::unique_ptr<NormalizeUTF8Op> nfd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfd));
|
||||
std::unique_ptr<NormalizeUTF8Op> nfkd_normalize_op(new NormalizeUTF8Op(NormalizeForm::kNfkd));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("ṩ");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = nfc_normalize_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
MS_LOG(INFO) << "NFC str:" << output->ToString();
|
||||
|
||||
nfkc_normalize_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
MS_LOG(INFO) << "NFKC str:" << output->ToString();
|
||||
|
||||
nfd_normalize_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
MS_LOG(INFO) << "NFD str:" << output->ToString();
|
||||
|
||||
nfkd_normalize_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
MS_LOG(INFO) << "NFKD str:" << output->ToString();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
|
||||
MS_LOG(INFO) << "Doing TestRegexReplace.";
|
||||
std::unique_ptr<RegexReplaceOp> regex_replace_op(new RegexReplaceOp("\\s+", "_", true));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = regex_replace_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
EXPECT_EQ(output->Size(), 1);
|
||||
EXPECT_EQ(output->Rank(), 0);
|
||||
MS_LOG(INFO) << "Out tensor1: " << output->ToString();
|
||||
CheckEqual(output, {}, "Welcome_to_China._中国_北京");
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
|
||||
MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
|
||||
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", ""));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = regex_tokenizer_op->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
|
||||
MS_LOG(INFO) << "Doing TestBasicTokenizer.";
|
||||
//bool lower_case, bool keep_whitespace,
|
||||
// NormalizeForm normalization_form, bool preserve_unused_token
|
||||
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false));
|
||||
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
|
||||
std::shared_ptr<Tensor> output;
|
||||
Status s = basic_tokenizer->Compute(input, &output);
|
||||
EXPECT_TRUE(s.IsOk());
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
Welcome to Beijing北京欢迎您
|
||||
長風破浪會有時,直掛雲帆濟滄海
|
||||
😀嘿嘿😃哈哈😄大笑😁嘻嘻
|
||||
明朝(1368—1644年)和清朝(1644—1911年),是中国封建王朝史上最后两个朝代
|
||||
明代(1368-1644)と清代(1644-1911)は、中国の封建王朝の歴史における最後の2つの王朝でした
|
||||
명나라 (1368-1644)와 청나라 (1644-1911)는 중국 봉건 왕조의 역사에서 마지막 두 왕조였다
|
||||
Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ
|
|
@ -0,0 +1,14 @@
|
|||
床前明月光
|
||||
疑是地上霜
|
||||
举头望明月
|
||||
低头思故乡
|
||||
I am making small mistakes during working hours
|
||||
😀嘿嘿😃哈哈😄大笑😁嘻嘻
|
||||
繁體字
|
||||
unused [CLS]
|
||||
unused [SEP]
|
||||
unused [UNK]
|
||||
unused [PAD]
|
||||
unused [MASK]
|
||||
12+/-28=40/-16
|
||||
Hello World!
|
|
@ -0,0 +1,6 @@
|
|||
ṩ
|
||||
ḍ̇
|
||||
q̣̇
|
||||
fi
|
||||
2⁵
|
||||
ẛ̣
|
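The six sample characters above exercise all four Unicode normalization forms. As a side illustration (not part of this commit), Python's standard unicodedata module shows the same behavior for a few of them:

import unicodedata

# "ﬁ" (U+FB01) and "2⁵" are unchanged by the canonical forms NFC/NFD but expanded
# to "fi" and "25" by the compatibility forms NFKC/NFKD, while "ṩ" stays composed
# under NFC and decomposes into a base letter plus combining marks under NFD.
for ch in ["ṩ", "ﬁ", "2⁵"]:
    print(ch,
          unicodedata.normalize("NFC", ch),
          unicodedata.normalize("NFKC", ch),
          unicodedata.normalize("NFD", ch),
          unicodedata.normalize("NFKD", ch))
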
|
@ -0,0 +1,8 @@
|
|||
Hello World
|
||||
Let's Go
|
||||
1:hello
|
||||
2:world
|
||||
31:beijing
|
||||
Welcome to China!
|
||||
我 不想 长大
|
||||
Welcome to Shenzhen!
|
|
@ -0,0 +1,3 @@
|
|||
Welcome to Shenzhen!
|
||||
北京欢迎您!Welcome to Beijing!
|
||||
12¥+36¥=?
|
|
@ -0,0 +1,25 @@
|
|||
my
|
||||
favorite
|
||||
book
|
||||
is
|
||||
love
|
||||
during
|
||||
the
|
||||
cholera
|
||||
era
|
||||
what
|
||||
我
|
||||
最
|
||||
喜
|
||||
欢
|
||||
的
|
||||
书
|
||||
是
|
||||
霍
|
||||
乱
|
||||
时
|
||||
期
|
||||
的
|
||||
爱
|
||||
情
|
||||
您
|
|
@ -0,0 +1,83 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing BasicTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
|
||||
|
||||
test_paras = [
|
||||
dict(
|
||||
first=1,
|
||||
last=6,
|
||||
expected_tokens=
|
||||
[['Welcome', 'to', 'Beijing', '北', '京', '欢', '迎', '您'],
|
||||
['長', '風', '破', '浪', '會', '有', '時', ',', '直', '掛', '雲', '帆', '濟', '滄', '海'],
|
||||
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
|
||||
['明', '朝', '(', '1368', '—', '1644', '年', ')', '和', '清', '朝',
|
||||
'(', '1644', '—', '1911', '年', ')', ',', '是', '中', '国', '封',
|
||||
'建', '王', '朝', '史', '上', '最', '后', '两', '个', '朝', '代'],
|
||||
['明', '代', '(', '1368', '-', '1644', ')', 'と', '清', '代',
|
||||
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
|
||||
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
|
||||
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
|
||||
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']]
|
||||
),
|
||||
dict(
|
||||
first=7,
|
||||
last=7,
|
||||
expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
|
||||
lower_case=True
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
|
||||
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
|
||||
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
|
||||
keep_whitespace=keep_whitespace,
|
||||
normalization_form=normalization_form,
|
||||
preserve_unused_token=preserve_unused_token)
|
||||
|
||||
dataset = dataset.map(operations=basic_tokenizer)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text'])
|
||||
logger.info("Out:", text)
|
||||
logger.info("Exp:", expected_tokens[count])
|
||||
np.testing.assert_array_equal(text, expected_tokens[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_basic_tokenizer():
|
||||
"""
|
||||
Test BasicTokenizer
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_basic_tokenizer(**paras)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_basic_tokenizer()
|
|
@ -0,0 +1,183 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing BertTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
|
||||
|
||||
vocab_bert = [
|
||||
"床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", "思", "故", "乡",
|
||||
"繁", "體", "字", "嘿", "哈", "大", "笑", "嘻",
|
||||
"i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
|
||||
"😀", "😃", "😄", "😁", "+", "/", "-", "=", "12", "28", "40", "16", " ", "I",
|
||||
"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"
|
||||
]
|
||||
pad = '<pad>'
|
||||
test_paras = [
|
||||
# test chinese text
|
||||
dict(
|
||||
first=1,
|
||||
last=4,
|
||||
expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
|
||||
[['疑'], ['是'], ['地'], ['上'], ['霜']],
|
||||
[['举'], ['头'], ['望'], ['明'], ['月']],
|
||||
[['低'], ['头'], ['思'], ['故'], ['乡']]],
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test english text
|
||||
dict(
|
||||
first=5,
|
||||
last=5,
|
||||
expect_str=[[['i', pad],
|
||||
["am", pad],
|
||||
['mak', '##ing'],
|
||||
['small', pad],
|
||||
['mistake', '##s'],
|
||||
['during', pad],
|
||||
['work', '##ing'],
|
||||
['hour', '##s']]],
|
||||
lower_case=True,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
dict(
|
||||
first=5,
|
||||
last=5,
|
||||
expect_str=[[['I', pad],
|
||||
["am", pad],
|
||||
['mak', '##ing'],
|
||||
['small', pad],
|
||||
['mistake', '##s'],
|
||||
['during', pad],
|
||||
['work', '##ing'],
|
||||
['hour', '##s']]],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test emoji tokens
|
||||
dict(
|
||||
first=6,
|
||||
last=7,
|
||||
expect_str=[
|
||||
[['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
|
||||
[['繁'], ['體'], ['字']]],
|
||||
normalization_form=nlp.utils.NormalizeForm.NFKC,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test preserved tokens
|
||||
dict(
|
||||
first=8,
|
||||
last=12,
|
||||
expect_str=[
|
||||
[['[UNK]'], ['[CLS]']],
|
||||
[['[UNK]'], ['[SEP]']],
|
||||
[['[UNK]'], ['[UNK]']],
|
||||
[['[UNK]'], ['[PAD]']],
|
||||
[['[UNK]'], ['[MASK]']],
|
||||
],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
),
|
||||
# test special symbol
|
||||
dict(
|
||||
first=13,
|
||||
last=13,
|
||||
expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
|
||||
preserve_unused_token=True,
|
||||
vocab_list=vocab_bert
|
||||
),
|
||||
# test non-default params
|
||||
dict(
|
||||
first=8,
|
||||
last=8,
|
||||
expect_str=[
|
||||
[['[UNK]'], [' '], ['[CLS]']],
|
||||
],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
keep_whitespace=True
|
||||
),
|
||||
dict(
|
||||
first=8,
|
||||
last=8,
|
||||
expect_str=[
|
||||
[['unused'], [' '], ['[CLS]']],
|
||||
],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=True,
|
||||
keep_whitespace=True,
|
||||
unknown_token=''
|
||||
),
|
||||
dict(
|
||||
first=8,
|
||||
last=8,
|
||||
expect_str=[
|
||||
[['unused'], [' '], ['['], ['CLS'], [']']],
|
||||
],
|
||||
lower_case=False,
|
||||
vocab_list=vocab_bert,
|
||||
preserve_unused_token=False,
|
||||
keep_whitespace=True,
|
||||
unknown_token=''
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def check_bert_tokenizer(first, last, expect_str,
|
||||
vocab_list,
|
||||
suffix_indicator='##',
|
||||
max_bytes_per_token=100, unknown_token='[UNK]',
|
||||
lower_case=False, keep_whitespace=False,
|
||||
normalization_form=nlp.utils.NormalizeForm.NONE,
|
||||
preserve_unused_token=False):
|
||||
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = nlp.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = nlp.BertTokenizer(
|
||||
vocab=vocab, suffix_indicator=suffix_indicator,
|
||||
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
|
||||
lower_case=lower_case, keep_whitespace=keep_whitespace,
|
||||
normalization_form=normalization_form,
|
||||
preserve_unused_token=preserve_unused_token)
|
||||
dataset = dataset.map(operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text'])
|
||||
logger.info("Out:", text)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(text, expect_str[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_bert_tokenizer():
|
||||
"""
|
||||
Test BertTokenizer
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_bert_tokenizer(**paras)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_bert_tokenizer()
|
|
@ -15,11 +15,15 @@
|
|||
"""
|
||||
Testing UnicodeCharTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
|
||||
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
|
||||
REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
|
||||
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"
|
||||
|
||||
|
||||
def split_by_unicode_char(input_strs):
|
||||
|
@ -48,5 +52,182 @@ def test_unicode_char_tokenizer():
|
|||
assert split_by_unicode_char(input_strs) == tokens
|
||||
|
||||
|
||||
def test_whitespace_tokenizer():
|
||||
"""
|
||||
Test WhitespaceTokenizer
|
||||
"""
|
||||
whitespace_strs = [["Welcome", "to", "Beijing!"],
|
||||
["北京欢迎您!"],
|
||||
["我喜欢English!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.WhitespaceTokenizer()
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert whitespace_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=False
|
||||
"""
|
||||
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[""]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is : {}".format(tokens))
|
||||
assert unicode_script_strs == tokens
|
||||
|
||||
|
||||
def test_unicode_script_tokenizer2():
|
||||
"""
|
||||
Test UnicodeScriptTokenizer when para keep_whitespace=True
|
||||
"""
|
||||
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
|
||||
["北京欢迎您", "!"],
|
||||
["我喜欢", "English", "!"],
|
||||
[" "]]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True)
|
||||
dataset = dataset.map(operations=tokenizer)
|
||||
tokens = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
tokens.append(text)
|
||||
logger.info("The out tokens is :", tokens)
|
||||
assert unicode_script_strs2 == tokens
|
||||
|
||||
|
||||
def test_case_fold():
|
||||
"""
|
||||
Test CaseFold
|
||||
"""
|
||||
expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "]
|
||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||
op = nlp.CaseFold()
|
||||
dataset = dataset.map(operations=op)
|
||||
|
||||
lower_strs = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
lower_strs.append(text)
|
||||
assert lower_strs == expect_strs
|
||||
|
||||
|
||||
def test_normalize_utf8():
|
||||
"""
|
||||
Test NormalizeUTF8
|
||||
"""
|
||||
|
||||
def normalize(normalize_form):
|
||||
dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
|
||||
normalize = nlp.NormalizeUTF8(normalize_form=normalize_form)
|
||||
dataset = dataset.map(operations=normalize)
|
||||
out_bytes = []
|
||||
out_texts = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
out_bytes.append(i['text'])
|
||||
out_texts.append(nlp.to_str(i['text']).tolist())
|
||||
logger.info("The out bytes is : ", out_bytes)
|
||||
logger.info("The out texts is: ", out_texts)
|
||||
return out_bytes
|
||||
|
||||
expect_normlize_data = [
|
||||
# NFC
|
||||
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
|
||||
# NFKC
|
||||
[b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'fi', b'25', b'\xe1\xb9\xa9'],
|
||||
# NFD
|
||||
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
|
||||
# NFKD
|
||||
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
|
||||
b'fi', b'25', b's\xcc\xa3\xcc\x87']
|
||||
]
|
||||
assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normlize_data[0]
|
||||
assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normlize_data[1]
|
||||
assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normlize_data[2]
|
||||
assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normlize_data[3]
|
||||
|
||||
|
||||
def test_regex_replace():
|
||||
"""
|
||||
Test RegexReplace
|
||||
"""
|
||||
|
||||
def regex_replace(first, last, expect_str, pattern, replace):
|
||||
dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
replace_op = nlp.RegexReplace(pattern, replace)
|
||||
dataset = dataset.map(operations=replace_op)
|
||||
out_text = []
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
out_text.append(text)
|
||||
logger.info("Out:", out_text)
|
||||
logger.info("Exp:", expect_str)
|
||||
assert expect_str == out_text
|
||||
|
||||
regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
|
||||
regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
|
||||
regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
|
||||
regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
|
||||
|
||||
|
||||
def test_regex_tokenizer():
|
||||
"""
|
||||
Test RegexTokenizer
|
||||
"""
|
||||
|
||||
def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
|
||||
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
|
||||
dataset = dataset.map(operations=tokenizer_op)
|
||||
out_text = []
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text']).tolist()
|
||||
np.testing.assert_array_equal(text, expect_str[count])
|
||||
count += 1
|
||||
out_text.append(text)
|
||||
logger.info("Out:", out_text)
|
||||
logger.info("Exp:", expect_str)
|
||||
|
||||
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
|
||||
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
|
||||
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
|
||||
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
|
||||
regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
|
||||
regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_unicode_char_tokenizer()
|
||||
test_whitespace_tokenizer()
|
||||
test_unicode_script_tokenizer()
|
||||
test_unicode_script_tokenizer2()
|
||||
test_case_fold()
|
||||
test_normalize_utf8()
|
||||
test_regex_replace()
|
||||
test_regex_tokenizer()
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing WordpieceTokenizer op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
import mindspore.dataset.text as nlp
|
||||
|
||||
WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
|
||||
|
||||
vocab_english = [
|
||||
"book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"
|
||||
]
|
||||
|
||||
vocab_chinese = [
|
||||
"我", '最', '喜', '欢', '的', '书', '是', '霍', '乱', '时', '期', '爱', '情'
|
||||
]
|
||||
|
||||
vocab_mix = vocab_chinese + vocab_english
|
||||
|
||||
test_paras = [
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
|
||||
['era'], ['[UNK]']],
|
||||
vocab_list=vocab_english
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
|
||||
['era'], ['what']],
|
||||
vocab_list=vocab_english,
|
||||
unknown_token=""
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=10,
|
||||
expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
|
||||
vocab_list=vocab_english,
|
||||
max_bytes_per_token=4
|
||||
),
|
||||
dict(
|
||||
first=11,
|
||||
last=25,
|
||||
expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
|
||||
['[UNK]']],
|
||||
vocab_list=vocab_chinese,
|
||||
),
|
||||
dict(
|
||||
first=25,
|
||||
last=25,
|
||||
expect_str=[['您']],
|
||||
vocab_list=vocab_chinese,
|
||||
unknown_token=""
|
||||
),
|
||||
dict(
|
||||
first=1,
|
||||
last=25,
|
||||
expect_str=[
|
||||
['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], ['era'],
|
||||
['[UNK]'],
|
||||
['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
|
||||
['[UNK]']],
|
||||
vocab_list=vocab_mix,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
|
||||
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
|
||||
if first > 1:
|
||||
dataset = dataset.skip(first - 1)
|
||||
if last >= first:
|
||||
dataset = dataset.take(last - first + 1)
|
||||
vocab = nlp.Vocab.from_list(vocab_list)
|
||||
tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
|
||||
max_bytes_per_token=max_bytes_per_token)
|
||||
dataset = dataset.map(operations=tokenizer_op)
|
||||
count = 0
|
||||
for i in dataset.create_dict_iterator():
|
||||
text = nlp.to_str(i['text'])
|
||||
logger.info("Out:", text)
|
||||
logger.info("Exp:", expect_str[count])
|
||||
np.testing.assert_array_equal(text, expect_str[count])
|
||||
count = count + 1
|
||||
|
||||
|
||||
def test_wordpiece_tokenizer():
|
||||
"""
|
||||
Test WordpieceTokenizer
|
||||
"""
|
||||
for paras in test_paras:
|
||||
check_wordpiece_tokenizer(**paras)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_wordpiece_tokenizer()
|
|
@ -0,0 +1,6 @@
{
  "strategy": "additive",
  "featureFilters": {
    "normalization": "include"
  }
}